kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,1241 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Kreuzberg
|
|
6
|
+
module Config
|
|
7
|
+
# OCR settings: the backend and language to use, plus optional nested
# configuration objects for Tesseract, PaddleOCR and OCR element output.
#
# @example
#   ocr = OCR.new(backend: 'tesseract', language: 'eng')
#   ocr.to_h # => { backend: 'tesseract', language: 'eng' }
class OCR
  attr_reader :backend, :language, :tesseract_config, :paddle_ocr_config, :element_config

  # @param backend [#to_s] backend name, stored as a String
  # @param language [#to_s] language code, stored as a String
  # @param tesseract_config [Tesseract, Hash, nil]
  # @param paddle_ocr_config [PaddleOcr, Hash, nil]
  # @param element_config [OcrElementConfig, Hash, nil]
  # @raise [ArgumentError] if a nested config is not the expected class, a Hash, or nil
  def initialize(backend: 'tesseract', language: 'eng', tesseract_config: nil,
                 paddle_ocr_config: nil, element_config: nil)
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
    @paddle_ocr_config = normalize_paddle_ocr_config(paddle_ocr_config)
    @element_config = normalize_element_config(element_config)
  end

  # @return [Hash] symbol-keyed settings; nested configs are converted with
  #   their own #to_h and nil entries are dropped
  def to_h
    nested = {
      tesseract_config: @tesseract_config,
      paddle_ocr_config: @paddle_ocr_config,
      element_config: @element_config
    }
    { backend: @backend, language: @language }
      .merge(nested.transform_values { |config| config&.to_h })
      .compact
  end

  private

  # Accepts nil or a ready-made Tesseract as-is; builds one from a Hash
  # (keys are symbolized first).
  def normalize_tesseract_config(value)
    case value
    when nil, Tesseract then value
    when Hash then Tesseract.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end

  # Same contract as #normalize_tesseract_config, for PaddleOcr.
  def normalize_paddle_ocr_config(value)
    case value
    when nil, PaddleOcr then value
    when Hash then PaddleOcr.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{PaddleOcr}, Hash, or nil, got #{value.class}"
    end
  end

  # Same contract as #normalize_tesseract_config, for OcrElementConfig.
  def normalize_element_config(value)
    case value
    when nil, OcrElementConfig then value
    when Hash then OcrElementConfig.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{OcrElementConfig}, Hash, or nil, got #{value.class}"
    end
  end
end
|
|
61
|
+
|
|
62
|
+
# Tesseract OCR engine configuration.
#
# A free-form bag of options stored under symbol keys. The only option given
# special treatment is :preprocessing, which is turned into an
# ImagePreprocessing when supplied as a Hash.
class Tesseract
  attr_reader :options

  # @param options [Hash] arbitrary engine options; keys are symbolized
  # @raise [ArgumentError] if :preprocessing is neither nil, an
  #   ImagePreprocessing, nor a Hash
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    normalize_nested_preprocessing!
  end

  # @return [Hash] a shallow copy of the stored options
  def to_h
    @options.dup
  end

  private

  # Replaces a Hash under :preprocessing with an ImagePreprocessing instance;
  # nil and existing ImagePreprocessing values are left untouched.
  def normalize_nested_preprocessing!
    case (raw = @options[:preprocessing])
    when nil, ImagePreprocessing
      nil
    when Hash
      @options[:preprocessing] = ImagePreprocessing.new(**raw.transform_keys(&:to_sym))
    else
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end
  end
end
|
|
87
|
+
|
|
88
|
+
# PaddleOCR engine configuration
#
# Every setting is optional; settings left as nil are omitted from #to_h.
#
# @example Basic usage
#   paddle = PaddleOcr.new(language: 'en', cache_dir: '/tmp/paddle')
#
# @example Advanced configuration
#   paddle = PaddleOcr.new(
#     language: 'en',
#     cache_dir: '/tmp/paddle',
#     use_angle_cls: true,
#     det_db_thresh: 0.3,
#     rec_batch_num: 32
#   )
#
class PaddleOcr
  attr_reader :language, :cache_dir, :use_angle_cls, :enable_table_detection,
              :det_db_thresh, :det_db_box_thresh, :det_db_unclip_ratio,
              :det_limit_side_len, :rec_batch_num

  # Non-nil arguments are coerced: language/cache_dir to String, the two
  # flags to true/false, the det_db_* values to Float, and
  # det_limit_side_len/rec_batch_num to Integer.
  def initialize(language: nil, cache_dir: nil, use_angle_cls: nil, enable_table_detection: nil,
                 det_db_thresh: nil, det_db_box_thresh: nil, det_db_unclip_ratio: nil,
                 det_limit_side_len: nil, rec_batch_num: nil)
    @language = language&.to_s
    @cache_dir = cache_dir&.to_s
    @use_angle_cls = boolean_or_nil(use_angle_cls)
    @enable_table_detection = boolean_or_nil(enable_table_detection)
    @det_db_thresh = det_db_thresh&.to_f
    @det_db_box_thresh = det_db_box_thresh&.to_f
    @det_db_unclip_ratio = det_db_unclip_ratio&.to_f
    @det_limit_side_len = det_limit_side_len&.to_i
    @rec_batch_num = rec_batch_num&.to_i
  end

  # @return [Hash] the settings that are not nil, keyed by symbol
  def to_h
    fields = %i[
      language cache_dir use_angle_cls enable_table_detection
      det_db_thresh det_db_box_thresh det_db_unclip_ratio
      det_limit_side_len rec_batch_num
    ]
    fields.each_with_object({}) do |field, out|
      current = instance_variable_get(:"@#{field}")
      out[field] = current unless current.nil?
    end
  end

  private

  # nil stays nil (meaning "not set"); anything else collapses to true/false.
  def boolean_or_nil(value)
    !!value unless value.nil?
  end
end
|
|
151
|
+
|
|
152
|
+
# OCR element configuration for output control
#
# @example Basic usage
#   config = OcrElementConfig.new(include_elements: true)
#
# @example Advanced configuration
#   config = OcrElementConfig.new(
#     include_elements: true,
#     min_level: 'word',
#     min_confidence: 0.7,
#     build_hierarchy: true
#   )
#
class OcrElementConfig
  attr_reader :include_elements, :min_level, :min_confidence, :build_hierarchy

  # The two flags are reduced to true/false; min_level becomes a String and
  # min_confidence a Float unless they are nil.
  def initialize(include_elements: false, min_level: nil, min_confidence: nil, build_hierarchy: false)
    @include_elements = !!include_elements
    @build_hierarchy = !!build_hierarchy
    @min_level = min_level&.to_s
    @min_confidence = min_confidence&.to_f
  end

  # @return [Hash] settings keyed by symbol, without nil entries
  def to_h
    serialized = {
      include_elements: @include_elements,
      min_level: @min_level,
      min_confidence: @min_confidence,
      build_hierarchy: @build_hierarchy
    }
    serialized.reject { |_key, setting| setting.nil? }
  end
end
|
|
189
|
+
|
|
190
|
+
# Chunking configuration
#
# `chunk_size` / `chunk_overlap` are accepted as alternative names and win
# over `max_chars` / `max_overlap` when both are given.
#
# @example
#   chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
#
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # @param max_chars [#to_i, nil] falls back to 1000 when no size is given
  # @param max_overlap [#to_i, nil] falls back to 200 when no overlap is given
  # @param preset [#to_s, nil]
  # @param embedding [Embedding, Hash, nil]
  # @param chunk_size [#to_i, nil] takes precedence over max_chars
  # @param chunk_overlap [#to_i, nil] takes precedence over max_overlap
  # @param enabled [Object, nil] reduced to true/false unless nil
  # @raise [ArgumentError] if the resolved size or overlap is negative, or if
  #   embedding is of an unsupported type
  def initialize(max_chars: nil, max_overlap: nil, preset: nil, embedding: nil,
                 chunk_size: nil, chunk_overlap: nil, enabled: true)
    @max_chars = (chunk_size || max_chars || 1000).to_i
    @max_overlap = (chunk_overlap || max_overlap || 200).to_i

    # Only negative numbers are rejected here; zero passes the check.
    { max_chars: @max_chars, max_overlap: @max_overlap }.each do |field, amount|
      raise ArgumentError, "#{field} must be a positive integer, got #{amount}" if amount.negative?
    end

    @preset = preset&.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = boolean_or_nil(enabled)
  end

  # @return [Hash] settings keyed by symbol; preset, embedding and enabled
  #   are left out when nil
  def to_h
    serialized = { max_chars: @max_chars, max_overlap: @max_overlap }
    # @type var serialized: Hash[Symbol, untyped]
    serialized[:preset] = @preset unless @preset.nil?
    serialized[:embedding] = @embedding.to_h unless @embedding.nil?
    serialized[:enabled] = @enabled unless @enabled.nil?
    serialized
  end

  private

  # nil and Embedding instances pass through; a Hash is used to build one.
  def normalize_embedding(value)
    case value
    when nil, Embedding then value
    when Hash then Embedding.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # nil stays nil; anything else collapses to true/false.
  def boolean_or_nil(value)
    !!value unless value.nil?
  end
end
|
|
250
|
+
|
|
251
|
+
# Embedding model configuration for document chunking
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @param model [#to_h] description of the model; must convert to a Hash,
  #   whose keys are then symbolized
  # @param normalize [Object, nil] reduced to true/false unless nil
  # @param batch_size [#to_i, nil]
  # @param show_download_progress [Object, nil] reduced to true/false unless nil
  # @param cache_dir [#to_s, nil]
  # @raise [ArgumentError] if model does not end up being a Hash
  def initialize(model: { type: :preset, name: 'balanced' }, normalize: true, batch_size: 32,
                 show_download_progress: false, cache_dir: nil)
    @model = normalize_model(model)
    @normalize = boolean_or_nil(normalize)
    @batch_size = batch_size&.to_i
    @show_download_progress = boolean_or_nil(show_download_progress)
    @cache_dir = cache_dir&.to_s
  end

  # @return [Hash] settings keyed by symbol, without nil entries
  def to_h
    serialized = {
      model: @model,
      normalize: @normalize,
      batch_size: @batch_size,
      show_download_progress: @show_download_progress,
      cache_dir: @cache_dir
    }
    serialized.reject { |_key, setting| setting.nil? }
  end

  private

  # Converts the model via #to_h when available and returns it with symbol keys.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    unless candidate.is_a?(Hash)
      raise ArgumentError, 'model must be a Hash describing the embedding model'
    end

    candidate.transform_keys(&:to_sym)
  end

  # nil stays nil; anything else collapses to true/false.
  def boolean_or_nil(value)
    !!value unless value.nil?
  end
end
|
|
298
|
+
|
|
299
|
+
# Language detection configuration
#
# @example
#   lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
#
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # @param enabled [Object] reduced to true/false
  # @param min_confidence [#to_f] stored as a Float
  # @param detect_multiple [Object] reduced to true/false
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled = !!enabled
    @detect_multiple = !!detect_multiple
    @min_confidence = min_confidence.to_f
  end

  # @return [Hash] all three settings keyed by symbol
  def to_h
    { enabled: @enabled, min_confidence: @min_confidence, detect_multiple: @detect_multiple }
  end
end
|
|
321
|
+
|
|
322
|
+
# Font configuration for PDF rendering
#
# Both attributes are writable after construction; values assigned through
# the writers are stored as given, without coercion.
#
# @example
#   font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
#
class FontConfig
  attr_accessor :enabled, :custom_font_dirs

  # @param enabled [Object] reduced to true/false
  # @param custom_font_dirs [Object, nil] stored as given
  def initialize(enabled: true, custom_font_dirs: nil)
    @enabled = !!enabled
    @custom_font_dirs = custom_font_dirs
  end

  # @return [Hash] settings keyed by symbol, without nil entries
  def to_h
    serialized = { enabled: @enabled, custom_font_dirs: @custom_font_dirs }
    serialized.reject { |_key, setting| setting.nil? }
  end
end
|
|
342
|
+
|
|
343
|
+
# Hierarchy detection configuration
#
# @example
#   hierarchy = Hierarchy.new(enabled: true, k_clusters: 6, include_bbox: true)
#
class Hierarchy
  attr_reader :enabled, :k_clusters, :include_bbox, :ocr_coverage_threshold

  # Builds a Hierarchy from loosely typed input.
  #
  # @param hash [Hierarchy, Hash, nil]
  # @return [Hierarchy, nil] the argument itself when it is nil or already a
  #   Hierarchy, a new instance for a Hash (keys symbolized), and nil for
  #   any other kind of value
  def self.from_h(hash)
    case hash
    when nil, self then hash
    when Hash then new(**hash.transform_keys(&:to_sym))
    end
  end

  # @param enabled [Object] reduced to true/false
  # @param k_clusters [#to_i, nil] nil falls back to 6
  # @param include_bbox [Object] reduced to true/false
  # @param ocr_coverage_threshold [#to_f, nil]
  def initialize(enabled: true, k_clusters: 6, include_bbox: true, ocr_coverage_threshold: nil)
    @enabled = !!enabled
    @include_bbox = !!include_bbox
    @k_clusters = k_clusters.nil? ? 6 : k_clusters.to_i
    @ocr_coverage_threshold = ocr_coverage_threshold&.to_f
  end

  # @return [Hash] settings keyed by symbol, without nil entries
  def to_h
    serialized = {
      enabled: @enabled,
      k_clusters: @k_clusters,
      include_bbox: @include_bbox,
      ocr_coverage_threshold: @ocr_coverage_threshold
    }
    serialized.reject { |_key, setting| setting.nil? }
  end
end
|
|
379
|
+
|
|
380
|
+
# PDF-specific options
#
# @example
#   pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
#
# @example With font configuration
#   font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
#   pdf = PDF.new(extract_images: true, font_config: font_config)
#
# @example With hierarchy configuration
#   hierarchy = Hierarchy.new(enabled: true, k_clusters: 6)
#   pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
#
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy

  # @param extract_images [Object] reduced to true/false
  # @param passwords [Array, Object, nil] an Array is mapped to Strings; any
  #   other truthy value becomes a one-element Array of its String form
  # @param extract_metadata [Object] reduced to true/false
  # @param font_config [FontConfig, Hash, nil]
  # @param hierarchy [Hierarchy, Hash, nil]
  # @raise [ArgumentError] if font_config or hierarchy has an unsupported type
  def initialize(extract_images: false, passwords: nil, extract_metadata: true,
                 font_config: nil, hierarchy: nil)
    @extract_images = !!extract_images
    @passwords =
      case passwords
      when Array then passwords.map(&:to_s)
      when nil, false then nil
      else [passwords.to_s]
      end
    @extract_metadata = !!extract_metadata
    @font_config = normalize_font_config(font_config)
    @hierarchy = normalize_hierarchy(hierarchy)
  end

  # @return [Hash] settings keyed by symbol; nested configs are converted
  #   with their own #to_h and nil entries are dropped
  def to_h
    serialized = {
      extract_images: @extract_images,
      passwords: @passwords,
      extract_metadata: @extract_metadata,
      font_config: @font_config&.to_h,
      hierarchy: @hierarchy&.to_h
    }
    serialized.reject { |_key, setting| setting.nil? }
  end

  # Assigns the font configuration, applying the same normalization as the
  # constructor.
  def font_config=(value)
    @font_config = normalize_font_config(value)
  end

  # Assigns the hierarchy configuration, applying the same normalization as
  # the constructor.
  def hierarchy=(value)
    @hierarchy = normalize_hierarchy(value)
  end

  private

  # nil and FontConfig instances pass through; a Hash is used to build one.
  def normalize_font_config(value)
    case value
    when nil, FontConfig then value
    when Hash then FontConfig.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{FontConfig}, Hash, or nil, got #{value.class}"
    end
  end

  # nil and Hierarchy instances pass through; a Hash is used to build one.
  def normalize_hierarchy(value)
    case value
    when nil, Hierarchy then value
    when Hash then Hierarchy.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Hierarchy}, Hash, or nil, got #{value.class}"
    end
  end
end
|
|
450
|
+
|
|
451
|
+
# Image extraction configuration
#
# @example
#   image = ImageExtraction.new(extract_images: true, target_dpi: 300)
#
# @example With auto-adjust DPI
#   image = ImageExtraction.new(
#     extract_images: true,
#     auto_adjust_dpi: true,
#     min_dpi: 150,
#     max_dpi: 600
#   )
#
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # The two flags are reduced to true/false; every numeric setting is
  # converted with #to_i.
  def initialize(extract_images: true, target_dpi: 300, max_image_dimension: 2000,
                 auto_adjust_dpi: true, min_dpi: 150, max_dpi: 600)
    @extract_images = !!extract_images
    @auto_adjust_dpi = !!auto_adjust_dpi
    @target_dpi = target_dpi.to_i
    @max_image_dimension = max_image_dimension.to_i
    @min_dpi = min_dpi.to_i
    @max_dpi = max_dpi.to_i
  end

  # @return [Hash] all six settings keyed by symbol
  def to_h
    %i[extract_images target_dpi max_image_dimension auto_adjust_dpi min_dpi max_dpi]
      .to_h { |field| [field, instance_variable_get(:"@#{field}")] }
  end
end
|
|
495
|
+
|
|
496
|
+
# Image preprocessing configuration for OCR
#
# @example Basic preprocessing
#   preprocessing = ImagePreprocessing.new(
#     binarization_method: "otsu",
#     denoise: true
#   )
#
# @example Advanced preprocessing
#   preprocessing = ImagePreprocessing.new(
#     target_dpi: 600,
#     auto_rotate: true,
#     deskew: true,
#     denoise: true,
#     contrast_enhance: true,
#     binarization_method: "sauvola",
#     invert_colors: false
#   )
#
class ImagePreprocessing
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  VALID_BINARIZATION_METHODS = %w[otsu sauvola niblack wolf bradley adaptive].freeze

  # Flags are reduced to true/false, target_dpi is converted with #to_i and
  # binarization_method with #to_s.
  #
  # @raise [ArgumentError] if binarization_method is not one of
  #   VALID_BINARIZATION_METHODS
  def initialize(target_dpi: 300, auto_rotate: true, deskew: true, denoise: false,
                 contrast_enhance: true, binarization_method: 'otsu', invert_colors: false)
    @target_dpi = target_dpi.to_i
    @auto_rotate = !!auto_rotate
    @deskew = !!deskew
    @denoise = !!denoise
    @contrast_enhance = !!contrast_enhance
    @binarization_method = binarization_method.to_s
    @invert_colors = !!invert_colors

    unless VALID_BINARIZATION_METHODS.include?(@binarization_method)
      allowed = VALID_BINARIZATION_METHODS.join(', ')
      raise ArgumentError,
            "Invalid binarization_method: #{@binarization_method}. Valid methods are: #{allowed}"
    end
  end

  # @return [Hash] all seven settings keyed by symbol
  def to_h
    %i[target_dpi auto_rotate deskew denoise contrast_enhance binarization_method invert_colors]
      .to_h { |field| [field, instance_variable_get(:"@#{field}")] }
  end
end
|
|
558
|
+
|
|
559
|
+
# Token reduction configuration
#
# @example Disable token reduction
#   token = TokenReduction.new(mode: "off")
#
# @example Light reduction
#   token = TokenReduction.new(mode: "light", preserve_important_words: true)
#
# @example Aggressive reduction
#   token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
#
class TokenReduction
  attr_reader :mode, :preserve_important_words

  VALID_MODES = %w[off light moderate aggressive maximum].freeze

  # @param mode [#to_s] must be one of VALID_MODES once converted to a String
  # @param preserve_important_words [Object] reduced to true/false
  # @raise [ArgumentError] if mode is not listed in VALID_MODES
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = !!preserve_important_words

    unless VALID_MODES.include?(@mode)
      raise ArgumentError, "Invalid token reduction mode: #{@mode}. Valid modes are: #{VALID_MODES.join(', ')}"
    end
  end

  # @return [Hash] both settings keyed by symbol
  def to_h
    { mode: @mode, preserve_important_words: @preserve_important_words }
  end
end
|
|
592
|
+
|
|
593
|
+
# HTML preprocessing configuration for content extraction
#
# Every setting is optional; settings left as nil are omitted from #to_h.
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # @param enabled [Object, nil] reduced to true/false unless nil
  # @param preset [#to_sym, nil] stored as a Symbol unless nil
  # @param remove_navigation [Object, nil] reduced to true/false unless nil
  # @param remove_forms [Object, nil] reduced to true/false unless nil
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = boolean_or_nil(enabled)
    @preset = preset&.to_sym
    @remove_navigation = boolean_or_nil(remove_navigation)
    @remove_forms = boolean_or_nil(remove_forms)
  end

  # @return [Hash] the settings that are not nil, keyed by symbol
  def to_h
    %i[enabled preset remove_navigation remove_forms].each_with_object({}) do |field, out|
      current = instance_variable_get(:"@#{field}")
      out[field] = current unless current.nil?
    end
  end

  private

  # nil stays nil; anything else collapses to true/false.
  def boolean_or_nil(value)
    !!value unless value.nil?
  end
end
|
|
621
|
+
|
|
622
|
+
# HTML rendering options for document conversion
#
# A free-form bag of options stored under symbol keys. A fixed set of
# style-like options is converted to Symbols, and a Hash given under
# :preprocessing is wrapped in an HtmlPreprocessing.
class HtmlOptions
  attr_reader :options

  # @param options [Hash] arbitrary options; keys are symbolized
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    symbol_keys = %i[
      heading_style
      code_block_style
      highlight_style
      list_indent_type
      newline_style
      whitespace_mode
    ]
    symbol_keys.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    nested = normalized[:preprocessing]
    if nested.is_a?(Hash)
      # Symbolize the nested keys as well, so a string-keyed Hash is accepted
      # here just like it is by the other nested-config normalizers.
      normalized[:preprocessing] = HtmlPreprocessing.new(**nested.transform_keys(&:to_sym))
    end
    @options = normalized
  end

  # @return [Hash] the options with nested config objects converted through
  #   their own #to_h; nil and Array values are returned unchanged
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # nil and Arrays both respond to #to_h, but converting them is wrong here:
  # nil would become {} and an Array of plain values raises TypeError.
  def serialize_value(value)
    return value if value.nil? || value.is_a?(Array)

    value.respond_to?(:to_h) ? value.to_h : value
  end
end
|
|
649
|
+
|
|
650
|
+
# YAKE keyword extraction parameters
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored as an Integer
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash] the single setting keyed by :window_size
  def to_h
    serialized = {}
    serialized[:window_size] = @window_size
    serialized
  end
end
|
|
662
|
+
|
|
663
|
+
# RAKE keyword extraction parameters
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # @param min_word_length [#to_i] stored as an Integer
  # @param max_words_per_phrase [#to_i] stored as an Integer
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length, @max_words_per_phrase = [min_word_length, max_words_per_phrase].map(&:to_i)
  end

  # @return [Hash] both settings keyed by symbol
  def to_h
    { min_word_length: @min_word_length, max_words_per_phrase: @max_words_per_phrase }
  end
end
|
|
679
|
+
|
|
680
|
+
# Keyword extraction configuration for document analysis
#
# Every setting is optional; settings left as nil are omitted from #to_h.
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # @param algorithm [#to_s, nil]
  # @param max_keywords [#to_i, nil]
  # @param min_score [#to_f, nil]
  # @param ngram_range [Array, nil] each element is converted with #to_i
  # @param language [#to_s, nil]
  # @param yake_params [KeywordYakeParams, Hash, nil]
  # @param rake_params [KeywordRakeParams, Hash, nil]
  # @raise [ArgumentError] if yake_params or rake_params has an unsupported type
  def initialize(algorithm: nil, max_keywords: nil, min_score: nil, ngram_range: nil,
                 language: nil, yake_params: nil, rake_params: nil)
    @algorithm = algorithm&.to_s
    @max_keywords = max_keywords&.to_i
    @min_score = min_score&.to_f
    @ngram_range = ngram_range&.map(&:to_i)
    @language = language&.to_s
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash] the settings that are not nil, keyed by symbol; nested
  #   parameter objects are converted with their own #to_h
  def to_h
    serialized = {
      algorithm: @algorithm,
      max_keywords: @max_keywords,
      min_score: @min_score,
      ngram_range: @ngram_range,
      language: @language,
      yake_params: @yake_params&.to_h,
      rake_params: @rake_params&.to_h
    }
    serialized.reject { |_key, setting| setting.nil? }
  end

  private

  # nil and instances of klass pass through; a Hash is used to build a klass
  # instance after its keys are symbolized.
  def normalize_nested(value, klass)
    case value
    when nil, klass then value
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
|
|
725
|
+
|
|
726
|
+
# Page tracking configuration for multi-page documents
#
# @example Enable page extraction
#   pages = PageConfig.new(extract_pages: true)
#
# @example Enable page markers in content
#   pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
#
class PageConfig
  attr_reader :extract_pages, :insert_page_markers, :marker_format

  # @param extract_pages [Object] nil, false and 0 mean false; anything else true
  # @param insert_page_markers [Object] nil, false and 0 mean false; anything else true
  # @param marker_format [#to_s] stored as a String
  def initialize(extract_pages: false, insert_page_markers: false,
                 marker_format: "\n\n<!-- PAGE {page_num} -->\n\n")
    @extract_pages = flag_set?(extract_pages)
    @insert_page_markers = flag_set?(insert_page_markers)
    @marker_format = marker_format.to_s
  end

  # @return [Hash] all three settings keyed by symbol
  def to_h
    { extract_pages: @extract_pages, insert_page_markers: @insert_page_markers, marker_format: @marker_format }
  end

  private

  # Boolean conversion that, unlike plain Ruby truthiness, also treats 0 as
  # false (as C/FFI code would).
  def flag_set?(value)
    return false if value.nil? || value == false || value == 0

    true
  end
end
|
|
756
|
+
|
|
757
|
+
# Post-processor configuration
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality", "formatting"]
#   )
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     disabled_processors: ["token_reduction"]
#   )
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # @param enabled [Object] reduced to true/false
  # @param enabled_processors [Array, nil] each element is converted with #to_s
  # @param disabled_processors [Array, nil] each element is converted with #to_s
  def initialize(enabled: true, enabled_processors: nil, disabled_processors: nil)
    @enabled = !!enabled
    @enabled_processors = enabled_processors&.map(&:to_s)
    @disabled_processors = disabled_processors&.map(&:to_s)
  end

  # @return [Hash] settings keyed by symbol; processor lists left as nil are
  #   omitted
  def to_h
    serialized = {
      enabled: @enabled,
      enabled_processors: @enabled_processors,
      disabled_processors: @disabled_processors
    }
    serialized.reject { |_key, setting| setting.nil? }
  end
end
|
|
795
|
+
|
|
796
|
+
# Main extraction configuration
#
# Bundles the top-level extraction flags with the optional nested
# sub-configurations. A sub-configuration may be given as an instance of its
# config class or as a Hash, which is passed to that class's constructor.
#
# @example Basic usage
#   config = Extraction.new(use_cache: true, force_ocr: true)
#
# @example With OCR
#   ocr = Config::OCR.new(backend: "tesseract", language: "eng")
#   config = Extraction.new(ocr: ocr)
#
# @example With image extraction
#   image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
#   config = Extraction.new(image_extraction: image)
#
# @example With post-processing
#   postprocessor = Config::PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality"]
#   )
#   config = Extraction.new(postprocessor: postprocessor)
#
# @example With document structure
#   config = Extraction.new(include_document_structure: true)
#
# @example With all options
#   config = Extraction.new(
#     use_cache: true,
#     enable_quality_processing: true,
#     force_ocr: false,
#     include_document_structure: true,
#     ocr: Config::OCR.new(language: "deu"),
#     chunking: Config::Chunking.new(max_chars: 500),
#     language_detection: Config::LanguageDetection.new(enabled: true),
#     pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
#     image_extraction: Config::ImageExtraction.new(target_dpi: 600),
#     postprocessor: Config::PostProcessor.new(enabled: true)
#   )
#
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :include_document_structure,
              :ocr, :chunking, :language_detection, :pdf_options,
              :images, :postprocessor,
              :token_reduction, :keywords, :html_options, :pages,
              :max_concurrent_extractions, :output_format, :result_format,
              :security_limits

  # Alias for backward compatibility - image_extraction is the canonical name
  alias image_extraction images

  # Keys that are allowed in the Extraction config
  ALLOWED_KEYS = %i[
    use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
    language_detection pdf_options image_extraction
    postprocessor token_reduction keywords html_options pages
    max_concurrent_extractions output_format result_format
    security_limits
  ].freeze

  # Aliases for backward compatibility
  KEY_ALIASES = {
    images: :image_extraction
  }.freeze

  # Valid output format values (case-insensitive, normalized internally)
  VALID_OUTPUT_FORMATS = %w[plain markdown html djot].freeze

  # Valid result format values (case-insensitive, normalized internally)
  VALID_RESULT_FORMATS = %w[unified elements element_based].freeze

  # Load configuration from a file.
  #
  # Parsing is delegated to the native +_config_from_file_native+ helper;
  # per the library documentation the format is detected from the file
  # extension (.toml, .yaml, .json). Keys the constructor does not know are
  # dropped before the object is built.
  #
  # @param path [String] Path to the configuration file
  # @return [Kreuzberg::Config::Extraction] Loaded configuration object
  #
  # @example Load from TOML
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  # @example Load from YAML
  #   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
  #
  def self.from_file(path)
    hash = Kreuzberg._config_from_file_native(path)
    new(**normalize_hash_keys(hash))
  end

  # Normalize hash keys from native function
  # - Converts string keys to symbols
  # - Maps aliased keys to their canonical names (the canonical key wins
  #   when both are present)
  # - Filters out unknown keys
  def self.normalize_hash_keys(hash)
    symbolized = hash.transform_keys(&:to_sym)

    # Apply key aliases
    KEY_ALIASES.each do |from, to|
      symbolized[to] = symbolized.delete(from) if symbolized.key?(from) && !symbolized.key?(to)
    end

    # Filter to only allowed keys
    symbolized.slice(*ALLOWED_KEYS)
  end

  private_class_method :normalize_hash_keys

  # Discover configuration file in current or parent directories.
  #
  # Discovery is delegated to the native +_config_discover_native+ helper;
  # per the library documentation it searches for kreuzberg.toml,
  # kreuzberg.yaml, or kreuzberg.json in the current directory and parent
  # directories.
  #
  # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    hash = Kreuzberg._config_discover_native
    return nil if hash.nil?

    new(**normalize_hash_keys(hash))
  end

  # Build a configuration from keyword arguments and/or a Hash.
  #
  # @param hash [Hash, nil] optional Hash (string or symbol keys) whose
  #   entries override the keyword arguments; keys that do not match a
  #   keyword are ignored, and a non-Hash value is ignored entirely
  # @raise [ArgumentError] if a format value is invalid or a
  #   sub-configuration is neither nil, a Hash, nor the expected class
  def initialize(hash = nil,
                 use_cache: true,
                 enable_quality_processing: true,
                 force_ocr: false,
                 include_document_structure: false,
                 ocr: nil,
                 chunking: nil,
                 language_detection: nil,
                 pdf_options: nil,
                 image_extraction: nil,
                 postprocessor: nil,
                 token_reduction: nil,
                 keywords: nil,
                 html_options: nil,
                 pages: nil,
                 max_concurrent_extractions: nil,
                 output_format: nil,
                 result_format: nil,
                 security_limits: nil)
    kwargs = {
      use_cache: use_cache, enable_quality_processing: enable_quality_processing,
      force_ocr: force_ocr, include_document_structure: include_document_structure,
      ocr: ocr, chunking: chunking, language_detection: language_detection,
      pdf_options: pdf_options, image_extraction: image_extraction,
      postprocessor: postprocessor,
      token_reduction: token_reduction, keywords: keywords, html_options: html_options,
      pages: pages, max_concurrent_extractions: max_concurrent_extractions,
      output_format: output_format, result_format: result_format,
      security_limits: security_limits
    }
    extracted = extract_from_hash(hash, kwargs)

    assign_attributes(extracted)
  end

  # Overlay the recognised entries of +hash+ on top of +defaults+.
  #
  # @param hash [Object] values to overlay; anything but a Hash is ignored
  # @param defaults [Hash{Symbol => Object}] baseline values
  # @return [Hash{Symbol => Object}] defaults with matching hash entries applied
  def extract_from_hash(hash, defaults)
    return defaults unless hash.is_a?(Hash)

    hash = hash.transform_keys(&:to_sym)
    defaults.merge(hash.slice(*defaults.keys))
  end

  # Populate every instance variable from a parameter Hash, coercing flags
  # to booleans, sub-configurations to their classes, and validating formats.
  #
  # @param params [Hash{Symbol => Object}] values keyed by canonical field name
  def assign_attributes(params)
    @use_cache = params[:use_cache] ? true : false
    @enable_quality_processing = params[:enable_quality_processing] ? true : false
    @force_ocr = params[:force_ocr] ? true : false
    @include_document_structure = params[:include_document_structure] ? true : false
    @ocr = normalize_config(params[:ocr], OCR)
    @chunking = normalize_config(params[:chunking], Chunking)
    @language_detection = normalize_config(params[:language_detection], LanguageDetection)
    @pdf_options = normalize_config(params[:pdf_options], PDF)
    @images = normalize_config(params[:image_extraction], ImageExtraction)
    @postprocessor = normalize_config(params[:postprocessor], PostProcessor)
    @token_reduction = normalize_config(params[:token_reduction], TokenReduction)
    @keywords = normalize_config(params[:keywords], Keywords)
    @html_options = normalize_config(params[:html_options], HtmlOptions)
    @pages = normalize_config(params[:pages], PageConfig)
    @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
    @output_format = validate_output_format(params[:output_format])
    @result_format = validate_result_format(params[:result_format])
    @security_limits = params[:security_limits]
  end

  # @param value [#to_s, nil] requested output format
  # @return [String, nil] lower-cased format, or nil when value is nil
  # @raise [ArgumentError] if the value is not in VALID_OUTPUT_FORMATS
  def validate_output_format(value)
    validate_format(value, VALID_OUTPUT_FORMATS, 'output_format')
  end

  # @param value [#to_s, nil] requested result format
  # @return [String, nil] lower-cased format, or nil when value is nil
  # @raise [ArgumentError] if the value is not in VALID_RESULT_FORMATS
  def validate_result_format(value)
    validate_format(value, VALID_RESULT_FORMATS, 'result_format')
  end

  # Convert the configuration into a Hash, omitting nil entries.
  #
  # @return [Hash{Symbol => Object}] flags and scalar settings plus the
  #   +to_h+ of every sub-configuration that is set
  def to_h
    core_config_hash.merge(sub_config_hash).compact
  end

  # @return [Hash{Symbol => Object}] flags and scalar settings (may contain nils)
  def core_config_hash
    # NOTE(review): security_limits is intentionally not serialized here, as
    # in the original implementation; confirm whether the native layer
    # expects it before adding it.
    {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr,
      include_document_structure: @include_document_structure,
      max_concurrent_extractions: @max_concurrent_extractions,
      output_format: @output_format,
      result_format: @result_format
    }
  end

  # @return [Hash{Symbol => Hash, nil}] each sub-configuration's +to_h+, or nil when unset
  def sub_config_hash
    {
      ocr: @ocr&.to_h, chunking: @chunking&.to_h,
      language_detection: @language_detection&.to_h, pdf_options: @pdf_options&.to_h,
      image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
      token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
      html_options: @html_options&.to_h, pages: @pages&.to_h
    }
  end

  # Serialize configuration to JSON string
  #
  # @return [String] JSON representation of the configuration
  #
  # @example
  #   config = Extraction.new(use_cache: true)
  #   json = config.to_json
  #   puts json # => "{\"use_cache\":true,...}"
  #
  def to_json(*_args)
    json_hash = to_h
    # Convert to JSON directly - the native function has issues
    JSON.generate(json_hash)
  end

  # Get a field from the configuration
  #
  # Supports dot notation for nested fields (e.g., "ocr.backend"). The lookup
  # walks the Hash returned by #to_h, so unset (nil) fields are not found.
  #
  # @param field_name [String, Symbol] Field name to retrieve
  # @return [Object, nil] Parsed field value, or nil if field doesn't exist
  #
  # @example Get a top-level field
  #   config = Extraction.new(use_cache: true)
  #   config.get_field("use_cache") # => true
  #
  # @example Get a nested field
  #   config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
  #   config.get_field("ocr.backend") # => "tesseract"
  #
  def get_field(field_name)
    json_hash = to_h
    field_path = field_name.to_s.split('.')

    # Navigate the nested hash using the field path; once a step lands on a
    # non-Hash value every remaining step yields nil.
    field_path.reduce(json_hash) do |current, key|
      case current
      when Hash
        # Check both symbol and string keys, prefer symbol if exists
        if current.key?(key.to_sym)
          current[key.to_sym]
        elsif current.key?(key.to_s)
          current[key.to_s]
        end
      end
    end
  end

  # Merge another configuration into this one
  #
  # Returns a new configuration with fields from the other config overriding
  # fields from this config (shallow merge). Because #to_h always contains
  # the four boolean flags, the other configuration's flags always win, even
  # when they were left at their defaults. +security_limits+ is carried over
  # from +other+ when set there, otherwise from this configuration.
  #
  # @param other [Extraction, Hash] Configuration to merge
  # @return [Extraction] New merged configuration
  #
  # @example
  #   base = Extraction.new(use_cache: true, force_ocr: false)
  #   override = Extraction.new(force_ocr: true)
  #   merged = base.merge(override)
  #   merged.use_cache # => true
  #   merged.force_ocr # => true
  #
  def merge(other)
    other_config = other.is_a?(Extraction) ? other : Extraction.new(**other)
    # Merge the two config hashes
    merged_hash = to_h.merge(other_config.to_h)

    # to_h does not include security_limits, so carry it across explicitly
    # instead of silently dropping it.
    limits = other_config.security_limits.nil? ? @security_limits : other_config.security_limits
    merged_hash[:security_limits] = limits unless limits.nil?

    Extraction.new(**merged_hash)
  end

  # Merge another configuration into this one (mutating)
  #
  # Modifies this configuration in-place by merging fields from another config.
  #
  # @param other [Extraction, Hash] Configuration to merge
  # @return [self]
  #
  # @example
  #   base = Extraction.new(use_cache: true, force_ocr: false)
  #   override = Extraction.new(force_ocr: true)
  #   base.merge!(override)
  #   base.use_cache # => true
  #   base.force_ocr # => true
  #
  def merge!(other)
    update_from_merged(merge(other))
    self
  end

  # Set a configuration field using hash-like syntax
  #
  # Aliased keys (see KEY_ALIASES) are mapped to their canonical name first.
  #
  # @param key [Symbol, String] Field name to set
  # @param value [Object] Value to set
  # @return [Object] The value that was set
  # @raise [ArgumentError] if the key is not a known configuration field
  #
  # @example
  #   config = Extraction.new(use_cache: true)
  #   config[:use_cache] = false
  #   config[:force_ocr] = true
  #
  # rubocop:disable Metrics/MethodLength
  def []=(key, value)
    case canonical_key(key)
    when :use_cache
      @use_cache = value ? true : false
    when :enable_quality_processing
      @enable_quality_processing = value ? true : false
    when :force_ocr
      @force_ocr = value ? true : false
    when :include_document_structure
      @include_document_structure = value ? true : false
    when :ocr
      @ocr = normalize_config(value, OCR)
    when :chunking
      @chunking = normalize_config(value, Chunking)
    when :language_detection
      @language_detection = normalize_config(value, LanguageDetection)
    when :pdf_options
      @pdf_options = normalize_config(value, PDF)
    when :image_extraction
      @images = normalize_config(value, ImageExtraction)
    when :postprocessor
      @postprocessor = normalize_config(value, PostProcessor)
    when :token_reduction
      @token_reduction = normalize_config(value, TokenReduction)
    when :keywords
      @keywords = normalize_config(value, Keywords)
    when :html_options
      @html_options = normalize_config(value, HtmlOptions)
    when :pages
      @pages = normalize_config(value, PageConfig)
    when :max_concurrent_extractions
      @max_concurrent_extractions = value&.to_i
    when :output_format
      @output_format = validate_output_format(value)
    when :result_format
      @result_format = validate_result_format(value)
    when :security_limits
      # Stored as given, matching how the constructor treats it.
      @security_limits = value
    else
      raise ArgumentError, "Unknown configuration key: #{key}"
    end
  end
  # rubocop:enable Metrics/MethodLength

  # Get a configuration field using hash-like syntax
  #
  # Only known configuration fields (ALLOWED_KEYS and their aliases) are
  # readable; any other key returns nil and no other method is ever invoked.
  #
  # @param key [Symbol, String] Field name to get
  # @return [Object, nil] The field value, or nil for an unknown key
  #
  # @example
  #   config = Extraction.new(use_cache: true)
  #   config[:use_cache] # => true
  #
  def [](key)
    return nil unless key.respond_to?(:to_sym)

    field = canonical_key(key)
    return nil unless ALLOWED_KEYS.include?(field)

    public_send(field)
  end

  # Set output_format attribute
  #
  # @param value [String, nil] Output format value
  # @return [String, nil] The value that was set
  #
  def output_format=(value)
    @output_format = validate_output_format(value)
  end

  # Set result_format attribute
  #
  # @param value [String, nil] Result format value
  # @return [String, nil] The value that was set
  #
  def result_format=(value)
    @result_format = validate_result_format(value)
  end

  private

  # Map a user-supplied key to its canonical Symbol, resolving aliases.
  def canonical_key(key)
    field = key.to_sym
    KEY_ALIASES.fetch(field, field)
  end

  # Shared implementation of the two public format validators.
  def validate_format(value, allowed, label)
    return nil if value.nil?

    str_value = value.to_s.downcase
    return str_value if allowed.include?(str_value)

    raise ArgumentError,
          "Invalid #{label}: #{value}. Valid values: #{allowed.join(', ')}"
  end

  # Coerce a sub-configuration value into an instance of +klass+.
  #
  # @return [Object, nil] nil, the value itself if already a +klass+, or a
  #   new +klass+ built from the Hash (keys symbolized)
  # @raise [ArgumentError] for any other kind of value
  def normalize_config(value, klass)
    return nil if value.nil?
    return value if value.is_a?(klass)
    return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)

    raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
  end

  # Copy every field of +merged+ onto this instance.
  def update_from_merged(merged)
    @use_cache = merged.use_cache
    @enable_quality_processing = merged.enable_quality_processing
    @force_ocr = merged.force_ocr
    @include_document_structure = merged.include_document_structure
    @ocr = merged.ocr
    @chunking = merged.chunking
    @language_detection = merged.language_detection
    @pdf_options = merged.pdf_options
    @images = merged.image_extraction
    @postprocessor = merged.postprocessor
    @token_reduction = merged.token_reduction
    @keywords = merged.keywords
    @html_options = merged.html_options
    @pages = merged.pages
    @max_concurrent_extractions = merged.max_concurrent_extractions
    @output_format = merged.output_format
    @result_format = merged.result_format
    @security_limits = merged.security_limits
  end
end
|
|
1240
|
+
end
|
|
1241
|
+
end
|