kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
|
|
2
|
+
use_cache: false
|
|
3
|
+
enable_quality_processing: true
|
|
4
|
+
force_ocr: true
|
|
5
|
+
|
|
6
|
+
ocr:
|
|
7
|
+
backend: tesseract
|
|
8
|
+
language: fra
|
|
9
|
+
|
|
10
|
+
chunking:
|
|
11
|
+
max_chars: 750
|
|
12
|
+
max_overlap: 150
|
|
13
|
+
preset: balanced
|
|
14
|
+
|
|
15
|
+
language_detection:
|
|
16
|
+
enabled: true
|
|
17
|
+
min_confidence: 0.85
|
|
18
|
+
|
|
19
|
+
pdf_options:
|
|
20
|
+
extract_images: false
|
|
21
|
+
passwords:
|
|
22
|
+
- password1
|
|
23
|
+
- password2
|
|
24
|
+
extract_metadata: true
|
|
25
|
+
|
|
26
|
+
image_extraction:
|
|
27
|
+
extract_images: true
|
|
28
|
+
target_dpi: 300
|
|
29
|
+
max_image_dimension: 4096
|
|
30
|
+
auto_adjust_dpi: true
|
|
31
|
+
min_dpi: 72
|
|
32
|
+
max_dpi: 600
|
|
33
|
+
|
|
34
|
+
postprocessor:
|
|
35
|
+
enabled: false
|
|
36
|
+
disabled_processors:
|
|
37
|
+
- token_reduction
|
|
38
|
+
|
|
39
|
+
token_reduction:
|
|
40
|
+
mode: light
|
|
41
|
+
preserve_important_words: false
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Cross-language serialization tests for Ruby bindings
|
|
4
|
+
#
|
|
5
|
+
# Validates that ExtractionConfig serializes consistently with other language bindings
|
|
6
|
+
|
|
7
|
+
require 'json'
|
|
8
|
+
require 'spec_helper'
|
|
9
|
+
|
|
10
|
+
RSpec.describe Kreuzberg::ExtractionConfig do
|
|
11
|
+
describe '#to_h' do
|
|
12
|
+
it 'serializes minimal config to hash' do
|
|
13
|
+
config = described_class.new
|
|
14
|
+
hash = config.to_h
|
|
15
|
+
|
|
16
|
+
expect(hash).to be_a(Hash)
|
|
17
|
+
expect(hash).to have_key(:use_cache)
|
|
18
|
+
expect(hash).to have_key(:enable_quality_processing)
|
|
19
|
+
expect(hash).to have_key(:force_ocr)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it 'serializes config with all fields' do
|
|
23
|
+
config = described_class.new(
|
|
24
|
+
use_cache: true,
|
|
25
|
+
enable_quality_processing: true,
|
|
26
|
+
force_ocr: false
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
hash = config.to_h
|
|
30
|
+
|
|
31
|
+
expect(hash[:use_cache]).to be(true)
|
|
32
|
+
expect(hash[:enable_quality_processing]).to be(true)
|
|
33
|
+
expect(hash[:force_ocr]).to be(false)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
it 'preserves field values after serialization' do
|
|
37
|
+
original = described_class.new(
|
|
38
|
+
use_cache: false,
|
|
39
|
+
enable_quality_processing: true
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
hash = original.to_h
|
|
43
|
+
|
|
44
|
+
expect(hash[:use_cache]).to be(false)
|
|
45
|
+
expect(hash[:enable_quality_processing]).to be(true)
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
describe '#to_json' do
|
|
50
|
+
it 'serializes to JSON' do
|
|
51
|
+
config = described_class.new(use_cache: true)
|
|
52
|
+
json = config.to_json
|
|
53
|
+
|
|
54
|
+
expect(json).to be_a(String)
|
|
55
|
+
|
|
56
|
+
parsed = JSON.parse(json, symbolize_names: true)
|
|
57
|
+
expect(parsed).to have_key(:use_cache)
|
|
58
|
+
expect(parsed[:use_cache]).to be(true)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
it 'produces valid JSON' do
|
|
62
|
+
config = described_class.new
|
|
63
|
+
json = config.to_json
|
|
64
|
+
|
|
65
|
+
expect { JSON.parse(json) }.not_to raise_error
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
it 'uses snake_case field names' do
|
|
69
|
+
config = described_class.new(use_cache: true)
|
|
70
|
+
json = config.to_json
|
|
71
|
+
|
|
72
|
+
expect(json).to include('use_cache')
|
|
73
|
+
expect(json).not_to include('useCache')
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
describe 'round-trip serialization' do
|
|
78
|
+
# Serializes a config to JSON, rebuilds a config from the parsed JSON, and
# checks that serializing the rebuilt config yields equivalent JSON.
it 'survives serialization -> deserialization -> serialization' do
  config1 = described_class.new(
    use_cache: true,
    enable_quality_processing: false
  )

  json1 = config1.to_json
  hash1 = JSON.parse(json1, symbolize_names: true)

  # Splat the parsed hash so it is passed as keyword arguments, matching how
  # every other example constructs the config. Since Ruby 3.0 a positional
  # Hash is no longer converted to keywords implicitly.
  # NOTE(review): the constructor signature is not visible here — confirm it
  # accepts every key emitted by #to_json.
  config2 = described_class.new(**hash1)
  json2 = config2.to_json

  # Compare parsed structures rather than raw strings so key order and
  # whitespace differences do not matter.
  expect(JSON.parse(json1)).to eq(JSON.parse(json2))
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
describe 'field consistency' do
|
|
96
|
+
it 'includes all mandatory fields' do
|
|
97
|
+
config = described_class.new
|
|
98
|
+
hash = config.to_h
|
|
99
|
+
|
|
100
|
+
mandatory_fields = %i[use_cache enable_quality_processing force_ocr]
|
|
101
|
+
mandatory_fields.each do |field|
|
|
102
|
+
expect(hash).to have_key(field)
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it 'handles nested ocr config' do
|
|
107
|
+
config = described_class.new(
|
|
108
|
+
ocr: {
|
|
109
|
+
backend: 'tesseract',
|
|
110
|
+
language: 'eng'
|
|
111
|
+
}
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
hash = config.to_h
|
|
115
|
+
|
|
116
|
+
expect(hash).to have_key(:ocr)
|
|
117
|
+
expect(hash[:ocr][:backend]).to eq('tesseract')
|
|
118
|
+
expect(hash[:ocr][:language]).to eq('eng')
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
describe 'immutability' do
|
|
123
|
+
it 'does not modify original config during serialization' do
|
|
124
|
+
config = described_class.new(use_cache: true)
|
|
125
|
+
|
|
126
|
+
json1 = config.to_json
|
|
127
|
+
json2 = config.to_json
|
|
128
|
+
json3 = config.to_json
|
|
129
|
+
|
|
130
|
+
expect(json1).to eq(json2)
|
|
131
|
+
expect(json2).to eq(json3)
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
end
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'stringio'
|
|
4
|
+
|
|
5
|
+
RSpec.describe 'Kreuzberg package' do
|
|
6
|
+
describe 'import and structure' do
|
|
7
|
+
it 'can be required without errors' do
|
|
8
|
+
expect { require 'kreuzberg' }.not_to raise_error
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
# Checks that Kreuzberg::VERSION is a String beginning with a
# MAJOR.MINOR.PATCH triple.
it 'has a version constant' do
  expect(Kreuzberg::VERSION).not_to be_nil
  expect(Kreuzberg::VERSION).to be_a(String)
  # \A anchors to the start of the string; ^ would also match at the start of
  # any later line in a multi-line value.
  expect(Kreuzberg::VERSION).to match(/\A\d+\.\d+\.\d+/)
end
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
describe 'public API exports' do
|
|
19
|
+
describe 'configuration classes' do
|
|
20
|
+
it 'exports Config::Extraction' do
|
|
21
|
+
expect(defined?(Kreuzberg::Config::Extraction)).to eq('constant')
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
it 'exports Config::OCR' do
|
|
25
|
+
expect(defined?(Kreuzberg::Config::OCR)).to eq('constant')
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
it 'exports Config::Chunking' do
|
|
29
|
+
expect(defined?(Kreuzberg::Config::Chunking)).to eq('constant')
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
it 'exports Config::LanguageDetection' do
|
|
33
|
+
expect(defined?(Kreuzberg::Config::LanguageDetection)).to eq('constant')
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
it 'exports Config::PDF' do
|
|
37
|
+
expect(defined?(Kreuzberg::Config::PDF)).to eq('constant')
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
it 'exports Config::HtmlOptions' do
|
|
41
|
+
expect(defined?(Kreuzberg::Config::HtmlOptions)).to eq('constant')
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
it 'exports Config::Keywords' do
|
|
45
|
+
expect(defined?(Kreuzberg::Config::Keywords)).to eq('constant')
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
describe 'result classes' do
|
|
50
|
+
it 'exports Result' do
|
|
51
|
+
expect(defined?(Kreuzberg::Result)).to eq('constant')
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
it 'exports Result::Table' do
|
|
55
|
+
expect(defined?(Kreuzberg::Result::Table)).to eq('constant')
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
it 'exports Result::Chunk' do
|
|
59
|
+
expect(defined?(Kreuzberg::Result::Chunk)).to eq('constant')
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
it 'exports Result::Image' do
|
|
63
|
+
expect(defined?(Kreuzberg::Result::Image)).to eq('constant')
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
describe 'exception classes' do
|
|
68
|
+
it 'exports Errors::Error' do
|
|
69
|
+
expect(defined?(Kreuzberg::Errors::Error)).to eq('constant')
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
it 'exports Errors::ValidationError' do
|
|
73
|
+
expect(defined?(Kreuzberg::Errors::ValidationError)).to eq('constant')
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
it 'exports Errors::ParsingError' do
|
|
77
|
+
expect(defined?(Kreuzberg::Errors::ParsingError)).to eq('constant')
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
it 'exports Errors::OCRError' do
|
|
81
|
+
expect(defined?(Kreuzberg::Errors::OCRError)).to eq('constant')
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
it 'exports Errors::MissingDependencyError' do
|
|
85
|
+
expect(defined?(Kreuzberg::Errors::MissingDependencyError)).to eq('constant')
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
it 'exports Errors::IOError' do
|
|
89
|
+
expect(defined?(Kreuzberg::Errors::IOError)).to eq('constant')
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it 'exports Errors::PluginError' do
|
|
93
|
+
expect(defined?(Kreuzberg::Errors::PluginError)).to eq('constant')
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
describe 'extraction functions (sync)' do
|
|
98
|
+
it 'exports extract_file_sync' do
|
|
99
|
+
expect(Kreuzberg).to respond_to(:extract_file_sync)
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
it 'exports extract_bytes_sync' do
|
|
103
|
+
expect(Kreuzberg).to respond_to(:extract_bytes_sync)
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it 'exports batch_extract_files_sync' do
|
|
107
|
+
expect(Kreuzberg).to respond_to(:batch_extract_files_sync)
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
describe 'extraction functions (async)' do
|
|
112
|
+
it 'exports extract_file' do
|
|
113
|
+
expect(Kreuzberg).to respond_to(:extract_file)
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
it 'exports extract_bytes' do
|
|
117
|
+
expect(Kreuzberg).to respond_to(:extract_bytes)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
it 'exports batch_extract_files' do
|
|
121
|
+
expect(Kreuzberg).to respond_to(:batch_extract_files)
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
describe 'utility modules' do
|
|
126
|
+
it 'exports CLI' do
|
|
127
|
+
expect(defined?(Kreuzberg::CLI)).to eq('constant')
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
it 'exports CLIProxy' do
|
|
131
|
+
expect(defined?(Kreuzberg::CLIProxy)).to eq('constant')
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
it 'exports APIProxy' do
|
|
135
|
+
expect(defined?(Kreuzberg::APIProxy)).to eq('constant')
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
it 'exports MCPProxy' do
|
|
139
|
+
expect(defined?(Kreuzberg::MCPProxy)).to eq('constant')
|
|
140
|
+
end
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
describe 'module structure' do
|
|
145
|
+
it 'defines Kreuzberg as a module' do
|
|
146
|
+
expect(Kreuzberg).to be_a(Module)
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
it 'defines Kreuzberg::Config as a module' do
|
|
150
|
+
expect(Kreuzberg::Config).to be_a(Module)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it 'defines Kreuzberg::Errors as a module' do
|
|
154
|
+
expect(Kreuzberg::Errors).to be_a(Module)
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
describe 'basic extraction smoke tests' do
|
|
159
|
+
it 'extracts inline text via bytes API' do
|
|
160
|
+
bytes = StringIO.new('Hello from Kreuzberg')
|
|
161
|
+
result = Kreuzberg.extract_bytes_sync(data: bytes.string, mime_type: 'text/plain')
|
|
162
|
+
|
|
163
|
+
expect(result.content).to include('Hello')
|
|
164
|
+
expect(result.mime_type).to eq('text/plain')
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
it 'extracts from small temp file via sync API' do
|
|
168
|
+
file = create_test_file('Simple document for smoke testing')
|
|
169
|
+
result = Kreuzberg.extract_file_sync(path: file)
|
|
170
|
+
|
|
171
|
+
expect(result.content).to include('Simple document')
|
|
172
|
+
expect(result.mime_type).to eq('text/plain')
|
|
173
|
+
ensure
|
|
174
|
+
File.delete(file) if file && File.exist?(file)
|
|
175
|
+
end
|
|
176
|
+
end
|
|
177
|
+
end
|
data/spec/spec_helper.rb
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'kreuzberg'
|
|
4
|
+
require 'tmpdir'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
|
|
7
|
+
RSpec.configure do |config|
|
|
8
|
+
config.expect_with :rspec do |expectations|
|
|
9
|
+
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
config.mock_with :rspec do |mocks|
|
|
13
|
+
mocks.verify_partial_doubles = true
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
config.shared_context_metadata_behavior = :apply_to_host_groups
|
|
17
|
+
config.filter_run_when_matching :focus
|
|
18
|
+
config.example_status_persistence_file_path = 'spec/examples.txt'
|
|
19
|
+
config.disable_monkey_patching!
|
|
20
|
+
config.warnings = true
|
|
21
|
+
config.default_formatter = 'doc' if config.files_to_run.one?
|
|
22
|
+
config.order = :random
|
|
23
|
+
Kernel.srand config.seed
|
|
24
|
+
|
|
25
|
+
config.include(Module.new do
|
|
26
|
+
# Returns the path of +filename+ inside the `fixtures` directory that sits
# next to this helper file.
#
# @param filename [String] fixture file name (may contain sub-directories)
# @return [String] joined path; the file's existence is not checked
def fixture_path(filename)
  fixtures_dir = File.join(__dir__, 'fixtures')
  File.join(fixtures_dir, filename)
end
|
|
29
|
+
|
|
30
|
+
# Resolves +relative_path+ against the `test_documents` directory located
# three levels above this helper file and returns the absolute path.
#
# @param relative_path [String] path relative to `test_documents`
# @return [String] absolute, normalized path; existence is not checked
def test_document_path(relative_path)
  unresolved = File.join(__dir__, '..', '..', '..', 'test_documents', relative_path)
  File.expand_path(unresolved)
end
|
|
33
|
+
|
|
34
|
+
# Writes +content+ to a file named +filename+ inside a freshly created,
# uniquely named temporary directory and returns the file's path.
#
# A unique directory per call keeps the requested file name (and extension)
# intact while preventing examples that run in random order or in parallel
# processes from overwriting or deleting each other's file.
#
# @param content [String] data written to the file
# @param filename [String] base name of the file to create
# @return [String] path of the written file
def create_test_file(content, filename: 'test.txt')
  dir = Dir.mktmpdir('kreuzberg-spec-')
  # Callers typically delete only the file itself, so remove the containing
  # directory (and anything left in it) when the process exits.
  at_exit { FileUtils.rm_rf(dir) }

  path = File.join(dir, filename)
  File.write(path, content)
  path
end
|
|
39
|
+
end)
|
|
40
|
+
end
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe Kreuzberg::Config::Chunking do
|
|
4
|
+
describe '#initialize' do
|
|
5
|
+
it 'creates config with default values' do
|
|
6
|
+
config = described_class.new
|
|
7
|
+
|
|
8
|
+
expect(config.max_chars).to eq 1000
|
|
9
|
+
expect(config.max_overlap).to eq 200
|
|
10
|
+
expect(config.preset).to be_nil
|
|
11
|
+
expect(config.embedding).to be_nil
|
|
12
|
+
expect(config.enabled).to be true
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
it 'creates config with custom values' do
|
|
16
|
+
config = described_class.new(
|
|
17
|
+
max_chars: 500,
|
|
18
|
+
max_overlap: 100,
|
|
19
|
+
preset: 'fast'
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
expect(config.max_chars).to eq 500
|
|
23
|
+
expect(config.max_overlap).to eq 100
|
|
24
|
+
expect(config.preset).to eq 'fast'
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
it 'accepts chunk_size alias for max_chars' do
|
|
28
|
+
config = described_class.new(chunk_size: 750)
|
|
29
|
+
|
|
30
|
+
expect(config.max_chars).to eq 750
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it 'accepts chunk_overlap alias for max_overlap' do
|
|
34
|
+
config = described_class.new(chunk_overlap: 150)
|
|
35
|
+
|
|
36
|
+
expect(config.max_overlap).to eq 150
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# When both the chunk_size alias and max_chars are supplied, the example
# asserts that max_chars ends up as the value passed via chunk_size (500),
# so the description states that precedence explicitly.
it 'prefers chunk_size over max_chars when both are provided' do
  config = described_class.new(chunk_size: 500, max_chars: 1000)

  expect(config.max_chars).to eq 500
end
|
|
44
|
+
|
|
45
|
+
it 'accepts embedding as instance' do
|
|
46
|
+
embedding = Kreuzberg::Config::Embedding.new(model: { type: :preset, name: 'fast' })
|
|
47
|
+
config = described_class.new(embedding: embedding)
|
|
48
|
+
|
|
49
|
+
expect(config.embedding).to be_a Kreuzberg::Config::Embedding
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
it 'converts embedding hash to instance' do
|
|
53
|
+
config = described_class.new(embedding: { model: { type: :preset, name: 'balanced' } })
|
|
54
|
+
|
|
55
|
+
expect(config.embedding).to be_a Kreuzberg::Config::Embedding
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
describe '#to_h' do
|
|
60
|
+
it 'serializes to hash with default values' do
|
|
61
|
+
config = described_class.new
|
|
62
|
+
hash = config.to_h
|
|
63
|
+
|
|
64
|
+
expect(hash).to be_a Hash
|
|
65
|
+
expect(hash[:max_chars]).to eq 1000
|
|
66
|
+
expect(hash[:max_overlap]).to eq 200
|
|
67
|
+
expect(hash[:enabled]).to be true
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
it 'includes embedding in hash when present' do
|
|
71
|
+
config = described_class.new(embedding: { model: { type: :preset, name: 'fast' } })
|
|
72
|
+
hash = config.to_h
|
|
73
|
+
|
|
74
|
+
expect(hash[:embedding]).to be_a Hash
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
it 'compacts nil values from hash' do
|
|
78
|
+
config = described_class.new
|
|
79
|
+
hash = config.to_h
|
|
80
|
+
|
|
81
|
+
expect(hash.key?(:preset)).to be false
|
|
82
|
+
expect(hash.key?(:embedding)).to be false
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
describe 'validation' do
|
|
87
|
+
it 'rejects negative max_chars' do
|
|
88
|
+
expect do
|
|
89
|
+
described_class.new(max_chars: -100)
|
|
90
|
+
end.to raise_error ArgumentError, /max_chars must be a positive integer/
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
it 'rejects negative max_overlap' do
|
|
94
|
+
expect do
|
|
95
|
+
described_class.new(max_overlap: -50)
|
|
96
|
+
end.to raise_error ArgumentError, /max_overlap must be a positive integer/
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
it 'accepts zero values' do
|
|
100
|
+
expect do
|
|
101
|
+
described_class.new(max_chars: 0, max_overlap: 0)
|
|
102
|
+
end.not_to raise_error
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
it 'accepts positive values' do
|
|
106
|
+
expect do
|
|
107
|
+
described_class.new(max_chars: 2000, max_overlap: 500)
|
|
108
|
+
end.not_to raise_error
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
describe 'keyword arguments' do
|
|
113
|
+
it 'accepts all keyword arguments' do
|
|
114
|
+
config = described_class.new(
|
|
115
|
+
max_chars: 750,
|
|
116
|
+
max_overlap: 150,
|
|
117
|
+
preset: 'balanced',
|
|
118
|
+
enabled: true
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
expect(config.max_chars).to eq 750
|
|
122
|
+
expect(config.max_overlap).to eq 150
|
|
123
|
+
expect(config.preset).to eq 'balanced'
|
|
124
|
+
expect(config.enabled).to be true
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
it 'converts preset to string' do
|
|
128
|
+
config = described_class.new(preset: :fast)
|
|
129
|
+
|
|
130
|
+
expect(config.preset).to eq 'fast'
|
|
131
|
+
expect(config.preset).to be_a String
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
describe 'equality' do
|
|
136
|
+
it 'compares configs by value' do
|
|
137
|
+
config1 = described_class.new(max_chars: 500, max_overlap: 100)
|
|
138
|
+
config2 = described_class.new(max_chars: 500, max_overlap: 100)
|
|
139
|
+
|
|
140
|
+
expect(config1.max_chars).to eq config2.max_chars
|
|
141
|
+
expect(config1.max_overlap).to eq config2.max_overlap
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
it 'detects differences' do
|
|
145
|
+
config1 = described_class.new(max_chars: 500)
|
|
146
|
+
config2 = described_class.new(max_chars: 1000)
|
|
147
|
+
|
|
148
|
+
expect(config1.max_chars).not_to eq config2.max_chars
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
describe 'nested config integration' do
|
|
153
|
+
it 'can be nested in Extraction config' do
|
|
154
|
+
chunking = described_class.new(max_chars: 750, preset: 'fast')
|
|
155
|
+
extraction = Kreuzberg::Config::Extraction.new(chunking: chunking)
|
|
156
|
+
|
|
157
|
+
expect(extraction.chunking).to be_a described_class
|
|
158
|
+
expect(extraction.chunking.max_chars).to eq 750
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
it 'accepts hash in Extraction config' do
|
|
162
|
+
extraction = Kreuzberg::Config::Extraction.new(
|
|
163
|
+
chunking: { max_chars: 500, preset: 'balanced' }
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
expect(extraction.chunking).to be_a described_class
|
|
167
|
+
expect(extraction.chunking.max_chars).to eq 500
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
describe 'symbol vs string key handling' do
|
|
172
|
+
it 'converts symbol preset to string' do
|
|
173
|
+
config = described_class.new(preset: :fast)
|
|
174
|
+
|
|
175
|
+
expect(config.preset).to eq 'fast'
|
|
176
|
+
expect(config.preset).to be_a String
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
it 'converts integer strings to integers' do
|
|
180
|
+
config = described_class.new(max_chars: '1500', max_overlap: '300')
|
|
181
|
+
|
|
182
|
+
expect(config.max_chars).to eq 1500
|
|
183
|
+
expect(config.max_overlap).to eq 300
|
|
184
|
+
expect(config.max_chars).to be_a Integer
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
describe 'enabled field' do
|
|
189
|
+
it 'defaults to true' do
|
|
190
|
+
config = described_class.new
|
|
191
|
+
|
|
192
|
+
expect(config.enabled).to be true
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
it 'accepts false' do
|
|
196
|
+
config = described_class.new(enabled: false)
|
|
197
|
+
|
|
198
|
+
expect(config.enabled).to be false
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
it 'converts truthy values to true' do
|
|
202
|
+
config = described_class.new(enabled: 'yes')
|
|
203
|
+
|
|
204
|
+
expect(config.enabled).to be true
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
it 'can be nil' do
|
|
208
|
+
config = described_class.new(enabled: nil)
|
|
209
|
+
|
|
210
|
+
expect(config.enabled).to be_nil
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
end
|