kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,41 @@
1
+
2
+ use_cache: false
3
+ enable_quality_processing: true
4
+ force_ocr: true
5
+
6
+ ocr:
7
+ backend: tesseract
8
+ language: fra
9
+
10
+ chunking:
11
+ max_chars: 750
12
+ max_overlap: 150
13
+ preset: balanced
14
+
15
+ language_detection:
16
+ enabled: true
17
+ min_confidence: 0.85
18
+
19
+ pdf_options:
20
+ extract_images: false
21
+ passwords:
22
+ - password1
23
+ - password2
24
+ extract_metadata: true
25
+
26
+ image_extraction:
27
+ extract_images: true
28
+ target_dpi: 300
29
+ max_image_dimension: 4096
30
+ auto_adjust_dpi: true
31
+ min_dpi: 72
32
+ max_dpi: 600
33
+
34
+ postprocessor:
35
+ enabled: false
36
+ disabled_processors:
37
+ - token_reduction
38
+
39
+ token_reduction:
40
+ mode: light
41
+ preserve_important_words: false
@@ -0,0 +1,3 @@
1
+ use_cache = "not_a_boolean"
2
+ [ocr
3
+ backend = "tesseract"
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Cross-language serialization tests for Ruby bindings
4
+ #
5
+ # Validates that ExtractionConfig serializes consistently with other language bindings
6
+
7
+ require 'json'
8
+ require 'spec_helper'
9
+
10
+ RSpec.describe Kreuzberg::ExtractionConfig do
11
+ describe '#to_h' do
12
+ it 'serializes minimal config to hash' do
13
+ config = described_class.new
14
+ hash = config.to_h
15
+
16
+ expect(hash).to be_a(Hash)
17
+ expect(hash).to have_key(:use_cache)
18
+ expect(hash).to have_key(:enable_quality_processing)
19
+ expect(hash).to have_key(:force_ocr)
20
+ end
21
+
22
+ it 'serializes config with all fields' do
23
+ config = described_class.new(
24
+ use_cache: true,
25
+ enable_quality_processing: true,
26
+ force_ocr: false
27
+ )
28
+
29
+ hash = config.to_h
30
+
31
+ expect(hash[:use_cache]).to be(true)
32
+ expect(hash[:enable_quality_processing]).to be(true)
33
+ expect(hash[:force_ocr]).to be(false)
34
+ end
35
+
36
+ it 'preserves field values after serialization' do
37
+ original = described_class.new(
38
+ use_cache: false,
39
+ enable_quality_processing: true
40
+ )
41
+
42
+ hash = original.to_h
43
+
44
+ expect(hash[:use_cache]).to be(false)
45
+ expect(hash[:enable_quality_processing]).to be(true)
46
+ end
47
+ end
48
+
49
+ describe '#to_json' do
50
+ it 'serializes to JSON' do
51
+ config = described_class.new(use_cache: true)
52
+ json = config.to_json
53
+
54
+ expect(json).to be_a(String)
55
+
56
+ parsed = JSON.parse(json, symbolize_names: true)
57
+ expect(parsed).to have_key(:use_cache)
58
+ expect(parsed[:use_cache]).to be(true)
59
+ end
60
+
61
+ it 'produces valid JSON' do
62
+ config = described_class.new
63
+ json = config.to_json
64
+
65
+ expect { JSON.parse(json) }.not_to raise_error
66
+ end
67
+
68
+ it 'uses snake_case field names' do
69
+ config = described_class.new(use_cache: true)
70
+ json = config.to_json
71
+
72
+ expect(json).to include('use_cache')
73
+ expect(json).not_to include('useCache')
74
+ end
75
+ end
76
+
77
+ describe 'round-trip serialization' do
78
+ it 'survives serialization -> deserialization -> serialization' do
79
+ config1 = described_class.new(
80
+ use_cache: true,
81
+ enable_quality_processing: false
82
+ )
83
+
84
+ json1 = config1.to_json
85
+ hash1 = JSON.parse(json1, symbolize_names: true)
86
+
87
+ config2 = described_class.new(hash1)
88
+ json2 = config2.to_json
89
+
90
+ # JSON strings should be equivalent
91
+ expect(JSON.parse(json1)).to eq(JSON.parse(json2))
92
+ end
93
+ end
94
+
95
+ describe 'field consistency' do
96
+ it 'includes all mandatory fields' do
97
+ config = described_class.new
98
+ hash = config.to_h
99
+
100
+ mandatory_fields = %i[use_cache enable_quality_processing force_ocr]
101
+ mandatory_fields.each do |field|
102
+ expect(hash).to have_key(field)
103
+ end
104
+ end
105
+
106
+ it 'handles nested ocr config' do
107
+ config = described_class.new(
108
+ ocr: {
109
+ backend: 'tesseract',
110
+ language: 'eng'
111
+ }
112
+ )
113
+
114
+ hash = config.to_h
115
+
116
+ expect(hash).to have_key(:ocr)
117
+ expect(hash[:ocr][:backend]).to eq('tesseract')
118
+ expect(hash[:ocr][:language]).to eq('eng')
119
+ end
120
+ end
121
+
122
+ describe 'immutability' do
123
+ it 'does not modify original config during serialization' do
124
+ config = described_class.new(use_cache: true)
125
+
126
+ json1 = config.to_json
127
+ json2 = config.to_json
128
+ json3 = config.to_json
129
+
130
+ expect(json1).to eq(json2)
131
+ expect(json2).to eq(json3)
132
+ end
133
+ end
134
+ end
@@ -0,0 +1,177 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ RSpec.describe 'Kreuzberg package' do
6
+ describe 'import and structure' do
7
+ it 'can be required without errors' do
8
+ expect { require 'kreuzberg' }.not_to raise_error
9
+ end
10
+
11
+ it 'has a version constant' do
12
+ expect(Kreuzberg::VERSION).not_to be_nil
13
+ expect(Kreuzberg::VERSION).to be_a(String)
14
+ expect(Kreuzberg::VERSION).to match(/^\d+\.\d+\.\d+/)
15
+ end
16
+ end
17
+
18
+ describe 'public API exports' do
19
+ describe 'configuration classes' do
20
+ it 'exports Config::Extraction' do
21
+ expect(defined?(Kreuzberg::Config::Extraction)).to eq('constant')
22
+ end
23
+
24
+ it 'exports Config::OCR' do
25
+ expect(defined?(Kreuzberg::Config::OCR)).to eq('constant')
26
+ end
27
+
28
+ it 'exports Config::Chunking' do
29
+ expect(defined?(Kreuzberg::Config::Chunking)).to eq('constant')
30
+ end
31
+
32
+ it 'exports Config::LanguageDetection' do
33
+ expect(defined?(Kreuzberg::Config::LanguageDetection)).to eq('constant')
34
+ end
35
+
36
+ it 'exports Config::PDF' do
37
+ expect(defined?(Kreuzberg::Config::PDF)).to eq('constant')
38
+ end
39
+
40
+ it 'exports Config::HtmlOptions' do
41
+ expect(defined?(Kreuzberg::Config::HtmlOptions)).to eq('constant')
42
+ end
43
+
44
+ it 'exports Config::Keywords' do
45
+ expect(defined?(Kreuzberg::Config::Keywords)).to eq('constant')
46
+ end
47
+ end
48
+
49
+ describe 'result classes' do
50
+ it 'exports Result' do
51
+ expect(defined?(Kreuzberg::Result)).to eq('constant')
52
+ end
53
+
54
+ it 'exports Result::Table' do
55
+ expect(defined?(Kreuzberg::Result::Table)).to eq('constant')
56
+ end
57
+
58
+ it 'exports Result::Chunk' do
59
+ expect(defined?(Kreuzberg::Result::Chunk)).to eq('constant')
60
+ end
61
+
62
+ it 'exports Result::Image' do
63
+ expect(defined?(Kreuzberg::Result::Image)).to eq('constant')
64
+ end
65
+ end
66
+
67
+ describe 'exception classes' do
68
+ it 'exports Errors::Error' do
69
+ expect(defined?(Kreuzberg::Errors::Error)).to eq('constant')
70
+ end
71
+
72
+ it 'exports Errors::ValidationError' do
73
+ expect(defined?(Kreuzberg::Errors::ValidationError)).to eq('constant')
74
+ end
75
+
76
+ it 'exports Errors::ParsingError' do
77
+ expect(defined?(Kreuzberg::Errors::ParsingError)).to eq('constant')
78
+ end
79
+
80
+ it 'exports Errors::OCRError' do
81
+ expect(defined?(Kreuzberg::Errors::OCRError)).to eq('constant')
82
+ end
83
+
84
+ it 'exports Errors::MissingDependencyError' do
85
+ expect(defined?(Kreuzberg::Errors::MissingDependencyError)).to eq('constant')
86
+ end
87
+
88
+ it 'exports Errors::IOError' do
89
+ expect(defined?(Kreuzberg::Errors::IOError)).to eq('constant')
90
+ end
91
+
92
+ it 'exports Errors::PluginError' do
93
+ expect(defined?(Kreuzberg::Errors::PluginError)).to eq('constant')
94
+ end
95
+ end
96
+
97
+ describe 'extraction functions (sync)' do
98
+ it 'exports extract_file_sync' do
99
+ expect(Kreuzberg).to respond_to(:extract_file_sync)
100
+ end
101
+
102
+ it 'exports extract_bytes_sync' do
103
+ expect(Kreuzberg).to respond_to(:extract_bytes_sync)
104
+ end
105
+
106
+ it 'exports batch_extract_files_sync' do
107
+ expect(Kreuzberg).to respond_to(:batch_extract_files_sync)
108
+ end
109
+ end
110
+
111
+ describe 'extraction functions (async)' do
112
+ it 'exports extract_file' do
113
+ expect(Kreuzberg).to respond_to(:extract_file)
114
+ end
115
+
116
+ it 'exports extract_bytes' do
117
+ expect(Kreuzberg).to respond_to(:extract_bytes)
118
+ end
119
+
120
+ it 'exports batch_extract_files' do
121
+ expect(Kreuzberg).to respond_to(:batch_extract_files)
122
+ end
123
+ end
124
+
125
+ describe 'utility modules' do
126
+ it 'exports CLI' do
127
+ expect(defined?(Kreuzberg::CLI)).to eq('constant')
128
+ end
129
+
130
+ it 'exports CLIProxy' do
131
+ expect(defined?(Kreuzberg::CLIProxy)).to eq('constant')
132
+ end
133
+
134
+ it 'exports APIProxy' do
135
+ expect(defined?(Kreuzberg::APIProxy)).to eq('constant')
136
+ end
137
+
138
+ it 'exports MCPProxy' do
139
+ expect(defined?(Kreuzberg::MCPProxy)).to eq('constant')
140
+ end
141
+ end
142
+ end
143
+
144
+ describe 'module structure' do
145
+ it 'defines Kreuzberg as a module' do
146
+ expect(Kreuzberg).to be_a(Module)
147
+ end
148
+
149
+ it 'defines Kreuzberg::Config as a module' do
150
+ expect(Kreuzberg::Config).to be_a(Module)
151
+ end
152
+
153
+ it 'defines Kreuzberg::Errors as a module' do
154
+ expect(Kreuzberg::Errors).to be_a(Module)
155
+ end
156
+ end
157
+
158
+ describe 'basic extraction smoke tests' do
159
+ it 'extracts inline text via bytes API' do
160
+ bytes = StringIO.new('Hello from Kreuzberg')
161
+ result = Kreuzberg.extract_bytes_sync(data: bytes.string, mime_type: 'text/plain')
162
+
163
+ expect(result.content).to include('Hello')
164
+ expect(result.mime_type).to eq('text/plain')
165
+ end
166
+
167
+ it 'extracts from small temp file via sync API' do
168
+ file = create_test_file('Simple document for smoke testing')
169
+ result = Kreuzberg.extract_file_sync(path: file)
170
+
171
+ expect(result.content).to include('Simple document')
172
+ expect(result.mime_type).to eq('text/plain')
173
+ ensure
174
+ File.delete(file) if file && File.exist?(file)
175
+ end
176
+ end
177
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'kreuzberg'
4
+ require 'tmpdir'
5
+ require 'fileutils'
6
+
7
+ RSpec.configure do |config|
8
+ config.expect_with :rspec do |expectations|
9
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
10
+ end
11
+
12
+ config.mock_with :rspec do |mocks|
13
+ mocks.verify_partial_doubles = true
14
+ end
15
+
16
+ config.shared_context_metadata_behavior = :apply_to_host_groups
17
+ config.filter_run_when_matching :focus
18
+ config.example_status_persistence_file_path = 'spec/examples.txt'
19
+ config.disable_monkey_patching!
20
+ config.warnings = true
21
+ config.default_formatter = 'doc' if config.files_to_run.one?
22
+ config.order = :random
23
+ Kernel.srand config.seed
24
+
25
+ config.include(Module.new do # anonymous module of helper methods registered through RSpec's config.include
26
+ def fixture_path(filename) # path of filename inside the 'fixtures' directory next to this file
27
+ File.join(__dir__, 'fixtures', filename)
28
+ end
29
+
30
+ def test_document_path(relative_path) # absolute path of relative_path under 'test_documents', three directories above this one
31
+ File.expand_path(File.join(__dir__, '..', '..', '..', 'test_documents', relative_path))
32
+ end
33
+
34
+ def create_test_file(content, filename: 'test.txt') # writes content to a file in the system temp dir and returns its path
35
+ path = File.join(Dir.tmpdir, filename) # fixed name: calls with the same filename overwrite the same file
36
+ File.write(path, content)
37
+ path
38
+ end
39
+ end)
40
+ end
@@ -0,0 +1,213 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::Chunking do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.max_chars).to eq 1000
9
+ expect(config.max_overlap).to eq 200
10
+ expect(config.preset).to be_nil
11
+ expect(config.embedding).to be_nil
12
+ expect(config.enabled).to be true
13
+ end
14
+
15
+ it 'creates config with custom values' do
16
+ config = described_class.new(
17
+ max_chars: 500,
18
+ max_overlap: 100,
19
+ preset: 'fast'
20
+ )
21
+
22
+ expect(config.max_chars).to eq 500
23
+ expect(config.max_overlap).to eq 100
24
+ expect(config.preset).to eq 'fast'
25
+ end
26
+
27
+ it 'accepts chunk_size alias for max_chars' do
28
+ config = described_class.new(chunk_size: 750)
29
+
30
+ expect(config.max_chars).to eq 750
31
+ end
32
+
33
+ it 'accepts chunk_overlap alias for max_overlap' do
34
+ config = described_class.new(chunk_overlap: 150)
35
+
36
+ expect(config.max_overlap).to eq 150
37
+ end
38
+
39
+ it 'uses max_chars when both chunk_size and max_chars provided' do
40
+ config = described_class.new(chunk_size: 500, max_chars: 1000)
41
+
42
+ expect(config.max_chars).to eq 500
43
+ end
44
+
45
+ it 'accepts embedding as instance' do
46
+ embedding = Kreuzberg::Config::Embedding.new(model: { type: :preset, name: 'fast' })
47
+ config = described_class.new(embedding: embedding)
48
+
49
+ expect(config.embedding).to be_a Kreuzberg::Config::Embedding
50
+ end
51
+
52
+ it 'converts embedding hash to instance' do
53
+ config = described_class.new(embedding: { model: { type: :preset, name: 'balanced' } })
54
+
55
+ expect(config.embedding).to be_a Kreuzberg::Config::Embedding
56
+ end
57
+ end
58
+
59
+ describe '#to_h' do
60
+ it 'serializes to hash with default values' do
61
+ config = described_class.new
62
+ hash = config.to_h
63
+
64
+ expect(hash).to be_a Hash
65
+ expect(hash[:max_chars]).to eq 1000
66
+ expect(hash[:max_overlap]).to eq 200
67
+ expect(hash[:enabled]).to be true
68
+ end
69
+
70
+ it 'includes embedding in hash when present' do
71
+ config = described_class.new(embedding: { model: { type: :preset, name: 'fast' } })
72
+ hash = config.to_h
73
+
74
+ expect(hash[:embedding]).to be_a Hash
75
+ end
76
+
77
+ it 'compacts nil values from hash' do
78
+ config = described_class.new
79
+ hash = config.to_h
80
+
81
+ expect(hash.key?(:preset)).to be false
82
+ expect(hash.key?(:embedding)).to be false
83
+ end
84
+ end
85
+
86
+ describe 'validation' do
87
+ it 'rejects negative max_chars' do
88
+ expect do
89
+ described_class.new(max_chars: -100)
90
+ end.to raise_error ArgumentError, /max_chars must be a positive integer/
91
+ end
92
+
93
+ it 'rejects negative max_overlap' do
94
+ expect do
95
+ described_class.new(max_overlap: -50)
96
+ end.to raise_error ArgumentError, /max_overlap must be a positive integer/
97
+ end
98
+
99
+ it 'accepts zero values' do
100
+ expect do
101
+ described_class.new(max_chars: 0, max_overlap: 0)
102
+ end.not_to raise_error
103
+ end
104
+
105
+ it 'accepts positive values' do
106
+ expect do
107
+ described_class.new(max_chars: 2000, max_overlap: 500)
108
+ end.not_to raise_error
109
+ end
110
+ end
111
+
112
+ describe 'keyword arguments' do
113
+ it 'accepts all keyword arguments' do
114
+ config = described_class.new(
115
+ max_chars: 750,
116
+ max_overlap: 150,
117
+ preset: 'balanced',
118
+ enabled: true
119
+ )
120
+
121
+ expect(config.max_chars).to eq 750
122
+ expect(config.max_overlap).to eq 150
123
+ expect(config.preset).to eq 'balanced'
124
+ expect(config.enabled).to be true
125
+ end
126
+
127
+ it 'converts preset to string' do
128
+ config = described_class.new(preset: :fast)
129
+
130
+ expect(config.preset).to eq 'fast'
131
+ expect(config.preset).to be_a String
132
+ end
133
+ end
134
+
135
+ describe 'equality' do
136
+ it 'compares configs by value' do
137
+ config1 = described_class.new(max_chars: 500, max_overlap: 100)
138
+ config2 = described_class.new(max_chars: 500, max_overlap: 100)
139
+
140
+ expect(config1.max_chars).to eq config2.max_chars
141
+ expect(config1.max_overlap).to eq config2.max_overlap
142
+ end
143
+
144
+ it 'detects differences' do
145
+ config1 = described_class.new(max_chars: 500)
146
+ config2 = described_class.new(max_chars: 1000)
147
+
148
+ expect(config1.max_chars).not_to eq config2.max_chars
149
+ end
150
+ end
151
+
152
+ describe 'nested config integration' do
153
+ it 'can be nested in Extraction config' do
154
+ chunking = described_class.new(max_chars: 750, preset: 'fast')
155
+ extraction = Kreuzberg::Config::Extraction.new(chunking: chunking)
156
+
157
+ expect(extraction.chunking).to be_a described_class
158
+ expect(extraction.chunking.max_chars).to eq 750
159
+ end
160
+
161
+ it 'accepts hash in Extraction config' do
162
+ extraction = Kreuzberg::Config::Extraction.new(
163
+ chunking: { max_chars: 500, preset: 'balanced' }
164
+ )
165
+
166
+ expect(extraction.chunking).to be_a described_class
167
+ expect(extraction.chunking.max_chars).to eq 500
168
+ end
169
+ end
170
+
171
+ describe 'symbol vs string key handling' do
172
+ it 'converts symbol preset to string' do
173
+ config = described_class.new(preset: :fast)
174
+
175
+ expect(config.preset).to eq 'fast'
176
+ expect(config.preset).to be_a String
177
+ end
178
+
179
+ it 'converts integer strings to integers' do
180
+ config = described_class.new(max_chars: '1500', max_overlap: '300')
181
+
182
+ expect(config.max_chars).to eq 1500
183
+ expect(config.max_overlap).to eq 300
184
+ expect(config.max_chars).to be_a Integer
185
+ end
186
+ end
187
+
188
+ describe 'enabled field' do
189
+ it 'defaults to true' do
190
+ config = described_class.new
191
+
192
+ expect(config.enabled).to be true
193
+ end
194
+
195
+ it 'accepts false' do
196
+ config = described_class.new(enabled: false)
197
+
198
+ expect(config.enabled).to be false
199
+ end
200
+
201
+ it 'converts truthy values to true' do
202
+ config = described_class.new(enabled: 'yes')
203
+
204
+ expect(config.enabled).to be true
205
+ end
206
+
207
+ it 'can be nil' do
208
+ config = described_class.new(enabled: nil)
209
+
210
+ expect(config.enabled).to be_nil
211
+ end
212
+ end
213
+ end