kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,1241 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Kreuzberg
6
+ module Config
7
# OCR settings: which backend and language to use, plus optional nested
# backend-specific and element-output configuration.
#
# @example
#   ocr = OCR.new(backend: 'tesseract', language: 'eng')
#
class OCR
  attr_reader :backend, :language, :tesseract_config, :paddle_ocr_config, :element_config

  # Backend and language are stringified. Each nested config may be given as
  # an instance of its config class, a Hash of options (string or symbol
  # keys), or nil.
  #
  # @raise [ArgumentError] when a nested config has an unsupported type
  def initialize(backend: 'tesseract', language: 'eng', tesseract_config: nil,
                 paddle_ocr_config: nil, element_config: nil)
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
    @paddle_ocr_config = normalize_paddle_ocr_config(paddle_ocr_config)
    @element_config = normalize_element_config(element_config)
  end

  # @return [Hash] settings as a Hash; nested configs are converted with
  #   their own #to_h and nil entries are dropped
  def to_h
    nested = {
      tesseract_config: @tesseract_config,
      paddle_ocr_config: @paddle_ocr_config,
      element_config: @element_config
    }.transform_values { |section| section&.to_h }

    { backend: @backend, language: @language }.merge(nested).compact
  end

  private

  def normalize_tesseract_config(value)
    case value
    when nil then nil
    when Tesseract then value
    when Hash then Tesseract.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end

  def normalize_paddle_ocr_config(value)
    case value
    when nil then nil
    when PaddleOcr then value
    when Hash then PaddleOcr.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{PaddleOcr}, Hash, or nil, got #{value.class}"
    end
  end

  def normalize_element_config(value)
    case value
    when nil then nil
    when OcrElementConfig then value
    when Hash then OcrElementConfig.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{OcrElementConfig}, Hash, or nil, got #{value.class}"
    end
  end
end
61
+
62
# Tesseract OCR engine configuration.
#
# Holds an open-ended set of options. Keys are symbolized, and a Hash given
# under :preprocessing is converted to an ImagePreprocessing instance.
class Tesseract
  attr_reader :options

  # @raise [ArgumentError] when :preprocessing is neither nil, a Hash, nor an
  #   ImagePreprocessing
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    normalize_nested_preprocessing!
  end

  # @return [Hash] a shallow copy of the stored options
  def to_h
    @options.dup
  end

  private

  def normalize_nested_preprocessing!
    raw = @options[:preprocessing]

    case raw
    when nil, ImagePreprocessing
      nil
    when Hash
      @options[:preprocessing] = ImagePreprocessing.new(**raw.transform_keys(&:to_sym))
    else
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end
  end
end
87
+
88
# PaddleOCR engine configuration.
#
# Every setting is optional; settings left as nil are omitted from #to_h.
#
# @example Basic usage
#   paddle = PaddleOcr.new(language: 'en', cache_dir: '/tmp/paddle')
#
# @example Tuning detection and recognition
#   paddle = PaddleOcr.new(
#     language: 'en',
#     use_angle_cls: true,
#     det_db_thresh: 0.3,
#     rec_batch_num: 32
#   )
#
class PaddleOcr
  attr_reader :language, :cache_dir, :use_angle_cls, :enable_table_detection,
              :det_db_thresh, :det_db_box_thresh, :det_db_unclip_ratio,
              :det_limit_side_len, :rec_batch_num

  # Non-nil values are coerced: strings via to_s, thresholds/ratios via to_f,
  # sizes/counts via to_i, and flags to strict true/false.
  def initialize(language: nil, cache_dir: nil, use_angle_cls: nil, enable_table_detection: nil,
                 det_db_thresh: nil, det_db_box_thresh: nil, det_db_unclip_ratio: nil,
                 det_limit_side_len: nil, rec_batch_num: nil)
    @language = language.nil? ? nil : language.to_s
    @cache_dir = cache_dir.nil? ? nil : cache_dir.to_s
    @use_angle_cls = boolean_or_nil(use_angle_cls)
    @enable_table_detection = boolean_or_nil(enable_table_detection)
    @det_db_thresh = det_db_thresh.nil? ? nil : det_db_thresh.to_f
    @det_db_box_thresh = det_db_box_thresh.nil? ? nil : det_db_box_thresh.to_f
    @det_db_unclip_ratio = det_db_unclip_ratio.nil? ? nil : det_db_unclip_ratio.to_f
    @det_limit_side_len = det_limit_side_len.nil? ? nil : det_limit_side_len.to_i
    @rec_batch_num = rec_batch_num.nil? ? nil : rec_batch_num.to_i
  end

  # @return [Hash] only the settings that were provided (false is kept)
  def to_h
    all_settings = {
      language: @language,
      cache_dir: @cache_dir,
      use_angle_cls: @use_angle_cls,
      enable_table_detection: @enable_table_detection,
      det_db_thresh: @det_db_thresh,
      det_db_box_thresh: @det_db_box_thresh,
      det_db_unclip_ratio: @det_db_unclip_ratio,
      det_limit_side_len: @det_limit_side_len,
      rec_batch_num: @rec_batch_num
    }
    all_settings.reject { |_name, setting| setting.nil? }
  end

  private

  # nil stays nil; anything else collapses to true or false.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
151
+
152
# OCR element configuration for output control.
#
# @example Basic usage
#   config = OcrElementConfig.new(include_elements: true)
#
# @example With filtering
#   config = OcrElementConfig.new(
#     include_elements: true,
#     min_level: 'word',
#     min_confidence: 0.7,
#     build_hierarchy: true
#   )
#
class OcrElementConfig
  attr_reader :include_elements, :min_level, :min_confidence, :build_hierarchy

  # Flags are reduced to strict booleans; min_level is stringified and
  # min_confidence converted with to_f when given.
  def initialize(include_elements: false, min_level: nil, min_confidence: nil, build_hierarchy: false)
    @include_elements = include_elements ? true : false
    @min_level = min_level.nil? ? nil : min_level.to_s
    @min_confidence = min_confidence.nil? ? nil : min_confidence.to_f
    @build_hierarchy = build_hierarchy ? true : false
  end

  # @return [Hash] both flags always; min_level / min_confidence only when set
  def to_h
    serialized = { include_elements: @include_elements }
    serialized[:min_level] = @min_level unless @min_level.nil?
    serialized[:min_confidence] = @min_confidence unless @min_confidence.nil?
    serialized[:build_hierarchy] = @build_hierarchy
    serialized
  end
end
189
+
190
# Chunking configuration.
#
# chunk_size / chunk_overlap are accepted as alternative names and take
# precedence over max_chars / max_overlap when both are supplied.
#
# @example
#   chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
#
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # Sizes fall back to 1000 / 200 when neither name is given.
  #
  # @raise [ArgumentError] when a size is negative (zero is accepted), or
  #   when embedding is not an Embedding, Hash, or nil
  def initialize(max_chars: nil, max_overlap: nil, preset: nil, embedding: nil,
                 chunk_size: nil, chunk_overlap: nil, enabled: true)
    @max_chars = (chunk_size || max_chars || 1000).to_i
    @max_overlap = (chunk_overlap || max_overlap || 200).to_i
    reject_negative!(:max_chars, @max_chars)
    reject_negative!(:max_overlap, @max_overlap)

    @preset = preset.nil? ? nil : preset.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = boolean_or_nil(enabled)
  end

  # @return [Hash] sizes always; preset, embedding and enabled only when set
  def to_h
    {
      max_chars: @max_chars,
      max_overlap: @max_overlap,
      preset: @preset,
      embedding: @embedding&.to_h,
      enabled: @enabled
    }.compact
  end

  private

  def reject_negative!(name, amount)
    return unless amount.negative?

    raise ArgumentError, "#{name} must be a positive integer, got #{amount}"
  end

  def normalize_embedding(value)
    case value
    when nil then nil
    when Embedding then value
    when Hash then Embedding.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # nil stays nil; anything else collapses to true or false.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
250
+
251
# Embedding model configuration for document chunking.
#
# @example
#   embedding = Embedding.new(model: { type: :preset, name: 'balanced' }, batch_size: 16)
#
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @raise [ArgumentError] when model cannot be expressed as a Hash
  def initialize(model: { type: :preset, name: 'balanced' }, normalize: true, batch_size: 32,
                 show_download_progress: false, cache_dir: nil)
    @model = normalize_model(model)
    @normalize = boolean_or_nil(normalize)
    @batch_size = batch_size.nil? ? nil : batch_size.to_i
    @show_download_progress = boolean_or_nil(show_download_progress)
    @cache_dir = cache_dir.nil? ? nil : cache_dir.to_s
  end

  # @return [Hash] settings with nil entries removed
  def to_h
    settings = {
      model: @model,
      normalize: @normalize,
      batch_size: @batch_size,
      show_download_progress: @show_download_progress,
      cache_dir: @cache_dir
    }
    settings.reject { |_name, setting| setting.nil? }
  end

  private

  # Anything responding to #to_h is converted first; the result must be a
  # Hash, whose keys are then symbolized.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    return candidate.transform_keys(&:to_sym) if candidate.is_a?(Hash)

    raise ArgumentError, 'model must be a Hash describing the embedding model'
  end

  # nil stays nil; anything else collapses to true or false.
  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
298
+
299
# Language detection configuration.
#
# @example
#   lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
#
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # Flags are reduced to strict booleans; min_confidence goes through to_f.
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled, @detect_multiple = [enabled, detect_multiple].map { |flag| flag ? true : false }
    @min_confidence = min_confidence.to_f
  end

  # @return [Hash] all three settings
  def to_h
    { enabled: @enabled, min_confidence: @min_confidence, detect_multiple: @detect_multiple }
  end
end
321
+
322
# Font configuration for PDF rendering.
#
# Both attributes are writable after construction.
#
# @example
#   font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
#
class FontConfig
  attr_accessor :enabled, :custom_font_dirs

  # enabled is reduced to a strict boolean; custom_font_dirs is stored as given.
  def initialize(enabled: true, custom_font_dirs: nil)
    @custom_font_dirs = custom_font_dirs
    @enabled = enabled ? true : false
  end

  # @return [Hash] enabled always; custom_font_dirs only when not nil
  def to_h
    serialized = { enabled: @enabled, custom_font_dirs: @custom_font_dirs }
    serialized.reject { |_name, setting| setting.nil? }
  end
end
342
+
343
# Hierarchy detection configuration.
#
# @example
#   hierarchy = Hierarchy.new(enabled: true, k_clusters: 6, include_bbox: true)
#
class Hierarchy
  attr_reader :enabled, :k_clusters, :include_bbox, :ocr_coverage_threshold

  # Build from nil, an existing instance, or a Hash of options.
  #
  # @return [Hierarchy, nil] nil for nil input and for any other unsupported type
  def self.from_h(hash)
    case hash
    when nil then nil
    when self then hash
    when Hash then new(**hash.transform_keys(&:to_sym))
    end
  end

  # Flags are reduced to strict booleans; a nil k_clusters falls back to 6.
  def initialize(enabled: true, k_clusters: 6, include_bbox: true, ocr_coverage_threshold: nil)
    @enabled = enabled ? true : false
    @k_clusters = k_clusters&.to_i || 6
    @include_bbox = include_bbox ? true : false
    @ocr_coverage_threshold = ocr_coverage_threshold.nil? ? nil : ocr_coverage_threshold.to_f
  end

  # @return [Hash] settings; ocr_coverage_threshold only when set
  def to_h
    serialized = { enabled: @enabled, k_clusters: @k_clusters, include_bbox: @include_bbox }
    serialized[:ocr_coverage_threshold] = @ocr_coverage_threshold unless @ocr_coverage_threshold.nil?
    serialized
  end
end
379
+
380
# PDF-specific options.
#
# @example
#   pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
#
# @example With font configuration
#   pdf = PDF.new(font_config: FontConfig.new(custom_font_dirs: ["/usr/share/fonts"]))
#
# @example With hierarchy configuration (Hash form)
#   pdf = PDF.new(hierarchy: { enabled: true, k_clusters: 6 })
#
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy

  # passwords may be a single value or an Array; entries are stringified.
  # font_config / hierarchy accept an instance, a Hash, or nil.
  #
  # @raise [ArgumentError] when font_config or hierarchy has an unsupported type
  def initialize(extract_images: false, passwords: nil, extract_metadata: true,
                 font_config: nil, hierarchy: nil)
    @extract_images = extract_images ? true : false
    @passwords = coerce_passwords(passwords)
    @extract_metadata = extract_metadata ? true : false
    @font_config = normalize_font_config(font_config)
    @hierarchy = normalize_hierarchy(hierarchy)
  end

  # @return [Hash] settings with nil entries removed; nested configs via #to_h
  def to_h
    serialized = {
      extract_images: @extract_images,
      passwords: @passwords,
      extract_metadata: @extract_metadata,
      font_config: @font_config&.to_h,
      hierarchy: @hierarchy&.to_h
    }
    serialized.compact
  end

  # Writers run the same normalization as the constructor.
  def font_config=(value)
    @font_config = normalize_font_config(value)
  end

  def hierarchy=(value)
    @hierarchy = normalize_hierarchy(value)
  end

  private

  def coerce_passwords(passwords)
    case passwords
    when Array then passwords.map(&:to_s)
    when nil, false then nil
    else [passwords.to_s]
    end
  end

  def normalize_font_config(value)
    case value
    when nil then nil
    when FontConfig then value
    when Hash then FontConfig.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{FontConfig}, Hash, or nil, got #{value.class}"
    end
  end

  def normalize_hierarchy(value)
    case value
    when nil then nil
    when Hierarchy then value
    when Hash then Hierarchy.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{Hierarchy}, Hash, or nil, got #{value.class}"
    end
  end
end
450
+
451
# Image extraction configuration.
#
# @example
#   image = ImageExtraction.new(extract_images: true, target_dpi: 300)
#
# @example Bounded automatic DPI adjustment
#   image = ImageExtraction.new(auto_adjust_dpi: true, min_dpi: 150, max_dpi: 600)
#
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # Flags are reduced to strict booleans; numeric settings go through to_i.
  def initialize(extract_images: true, target_dpi: 300, max_image_dimension: 2000,
                 auto_adjust_dpi: true, min_dpi: 150, max_dpi: 600)
    @extract_images = extract_images ? true : false
    @target_dpi = target_dpi.to_i
    @max_image_dimension = max_image_dimension.to_i
    @auto_adjust_dpi = auto_adjust_dpi ? true : false
    @min_dpi = min_dpi.to_i
    @max_dpi = max_dpi.to_i
  end

  # @return [Hash] all six settings
  def to_h
    flags = { extract_images: @extract_images, auto_adjust_dpi: @auto_adjust_dpi }
    sizes = { target_dpi: @target_dpi, max_image_dimension: @max_image_dimension,
              min_dpi: @min_dpi, max_dpi: @max_dpi }
    flags.merge(sizes)
  end
end
495
+
496
# Image preprocessing configuration for OCR.
#
# @example Basic preprocessing
#   preprocessing = ImagePreprocessing.new(binarization_method: "otsu", denoise: true)
#
# @example Advanced preprocessing
#   preprocessing = ImagePreprocessing.new(
#     target_dpi: 600,
#     deskew: true,
#     contrast_enhance: true,
#     binarization_method: "sauvola"
#   )
#
class ImagePreprocessing
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  VALID_BINARIZATION_METHODS = %w[otsu sauvola niblack wolf bradley adaptive].freeze

  # Flags are reduced to strict booleans, target_dpi goes through to_i and
  # binarization_method through to_s.
  #
  # @raise [ArgumentError] when binarization_method is not one of
  #   VALID_BINARIZATION_METHODS
  def initialize(target_dpi: 300, auto_rotate: true, deskew: true, denoise: false,
                 contrast_enhance: true, binarization_method: 'otsu', invert_colors: false)
    @target_dpi = target_dpi.to_i
    @auto_rotate = auto_rotate ? true : false
    @deskew = deskew ? true : false
    @denoise = denoise ? true : false
    @contrast_enhance = contrast_enhance ? true : false
    @binarization_method = binarization_method.to_s
    @invert_colors = invert_colors ? true : false

    validate_binarization_method!
  end

  # @return [Hash] all seven settings
  def to_h
    {
      target_dpi: @target_dpi, auto_rotate: @auto_rotate, deskew: @deskew, denoise: @denoise,
      contrast_enhance: @contrast_enhance, binarization_method: @binarization_method,
      invert_colors: @invert_colors
    }
  end

  private

  def validate_binarization_method!
    return if VALID_BINARIZATION_METHODS.include?(@binarization_method)

    raise ArgumentError,
          "Invalid binarization_method: #{@binarization_method}. " \
          "Valid methods are: #{VALID_BINARIZATION_METHODS.join(', ')}"
  end
end
558
+
559
# Token reduction configuration.
#
# @example Disable token reduction
#   token = TokenReduction.new(mode: "off")
#
# @example Aggressive reduction
#   token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
#
class TokenReduction
  attr_reader :mode, :preserve_important_words

  VALID_MODES = %w[off light moderate aggressive maximum].freeze

  # mode is stringified and must be one of VALID_MODES.
  #
  # @raise [ArgumentError] on an unknown mode
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false

    unless VALID_MODES.include?(@mode)
      raise ArgumentError, "Invalid token reduction mode: #{@mode}. Valid modes are: #{VALID_MODES.join(', ')}"
    end
  end

  # @return [Hash] mode and preserve_important_words
  def to_h
    { mode: @mode, preserve_important_words: @preserve_important_words }
  end
end
592
+
593
# HTML preprocessing configuration for content extraction.
#
# All settings are optional; unset ones are left out of #to_h.
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # Flags become strict booleans (nil stays nil); preset is symbolized.
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = boolean_or_nil(enabled)
    @preset = preset.nil? ? nil : preset.to_sym
    @remove_navigation = boolean_or_nil(remove_navigation)
    @remove_forms = boolean_or_nil(remove_forms)
  end

  # @return [Hash] only the settings that were provided
  def to_h
    serialized = {
      enabled: @enabled,
      preset: @preset,
      remove_navigation: @remove_navigation,
      remove_forms: @remove_forms
    }
    serialized.reject { |_name, setting| setting.nil? }
  end

  private

  def boolean_or_nil(value)
    return if value.nil?

    value ? true : false
  end
end
621
+
622
# HTML rendering options for document conversion.
#
# Holds an open-ended set of options. Keys are symbolized, the style-like
# options listed in SYMBOL_KEYS have their values symbolized, and a Hash given
# under :preprocessing is converted to an HtmlPreprocessing instance.
class HtmlOptions
  # Options whose values are converted to Symbols when present.
  SYMBOL_KEYS = %i[
    heading_style
    code_block_style
    highlight_style
    list_indent_type
    newline_style
    whitespace_mode
  ].freeze

  attr_reader :options

  # @param options [Hash] arbitrary options with String or Symbol keys
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)
    SYMBOL_KEYS.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    preprocessing = normalized[:preprocessing]
    if preprocessing.is_a?(Hash)
      # Symbolize nested keys so string-keyed hashes map onto the keyword
      # parameters, as the other nested-config normalizers in this file do.
      normalized[:preprocessing] = HtmlPreprocessing.new(**preprocessing.transform_keys(&:to_sym))
    end

    @options = normalized
  end

  # @return [Hash] the options, with nested config objects converted via #to_h
  def to_h
    @options.transform_values { |value| serialize_value(value) }
  end

  private

  # nil, Arrays and Hashes are passed through untouched: all three respond to
  # #to_h, but nil.to_h yields {} and Array#to_h raises TypeError unless every
  # element is a pair. Anything else that responds to #to_h is converted.
  def serialize_value(value)
    case value
    when nil, Array, Hash then value
    else value.respond_to?(:to_h) ? value.to_h : value
    end
  end
end
649
+
650
# YAKE keyword extraction parameters.
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored as an Integer
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash] the single window_size setting
  def to_h
    Hash[:window_size, @window_size]
  end
end
662
+
663
# RAKE keyword extraction parameters.
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # Both values are stored as Integers (via to_i).
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length, @max_words_per_phrase = [min_word_length, max_words_per_phrase].map(&:to_i)
  end

  # @return [Hash] both settings
  def to_h
    { min_word_length: @min_word_length, max_words_per_phrase: @max_words_per_phrase }
  end
end
679
+
680
# Keyword extraction configuration for document analysis.
#
# All settings are optional; unset ones are left out of #to_h.
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # yake_params / rake_params accept an instance of their params class, a
  # Hash, or nil.
  #
  # @raise [ArgumentError] when a nested params value has an unsupported type
  def initialize(algorithm: nil, max_keywords: nil, min_score: nil, ngram_range: nil,
                 language: nil, yake_params: nil, rake_params: nil)
    @algorithm = algorithm.nil? ? nil : algorithm.to_s
    @max_keywords = max_keywords.nil? ? nil : max_keywords.to_i
    @min_score = min_score.nil? ? nil : min_score.to_f
    @ngram_range = ngram_range.nil? ? nil : ngram_range.map(&:to_i)
    @language = language.nil? ? nil : language.to_s
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash] only the settings that were provided; nested params via #to_h
  def to_h
    serialized = {
      algorithm: @algorithm,
      max_keywords: @max_keywords,
      min_score: @min_score,
      ngram_range: @ngram_range,
      language: @language,
      yake_params: @yake_params&.to_h,
      rake_params: @rake_params&.to_h
    }
    serialized.reject { |_name, setting| setting.nil? }
  end

  private

  def normalize_nested(value, klass)
    case value
    when nil then nil
    when klass then value
    when Hash then klass.new(**value.transform_keys(&:to_sym))
    else raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
725
+
726
# Page tracking configuration for multi-page documents.
#
# @example Enable page extraction
#   pages = PageConfig.new(extract_pages: true)
#
# @example Enable page markers in content
#   pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
#
class PageConfig
  attr_reader :extract_pages, :insert_page_markers, :marker_format

  # Flags use C-style truthiness: nil, false and 0 disable, anything else
  # enables. marker_format is stringified.
  def initialize(extract_pages: false, insert_page_markers: false,
                 marker_format: "\n\n<!-- PAGE {page_num} -->\n\n")
    @extract_pages = flag_set?(extract_pages)
    @insert_page_markers = flag_set?(insert_page_markers)
    @marker_format = marker_format.to_s
  end

  # @return [Hash] all three settings
  def to_h
    { extract_pages: @extract_pages, insert_page_markers: @insert_page_markers,
      marker_format: @marker_format }
  end

  private

  # Unlike plain Ruby truthiness, a numeric zero counts as "off".
  def flag_set?(value)
    !(value.nil? || value == false || value == 0)
  end
end
756
+
757
# Post-processor configuration.
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(enabled: true, enabled_processors: ["quality", "formatting"])
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(enabled: true, disabled_processors: ["token_reduction"])
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # enabled is reduced to a strict boolean; processor lists, when given, have
  # every entry stringified.
  def initialize(enabled: true, enabled_processors: nil, disabled_processors: nil)
    @enabled = enabled ? true : false
    @enabled_processors = enabled_processors.nil? ? nil : enabled_processors.map(&:to_s)
    @disabled_processors = disabled_processors.nil? ? nil : disabled_processors.map(&:to_s)
  end

  # @return [Hash] enabled always; the processor lists only when set
  def to_h
    serialized = { enabled: @enabled }
    serialized[:enabled_processors] = @enabled_processors unless @enabled_processors.nil?
    serialized[:disabled_processors] = @disabled_processors unless @disabled_processors.nil?
    serialized
  end
end
795
+
796
+ # Main extraction configuration
797
+ #
798
+ # @example Basic usage
799
+ # config = Extraction.new(use_cache: true, force_ocr: true)
800
+ #
801
+ # @example With OCR
802
+ # ocr = Config::OCR.new(backend: "tesseract", language: "eng")
803
+ # config = Extraction.new(ocr: ocr)
804
+ #
805
+ # @example With image extraction
806
+ # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
807
+ # config = Extraction.new(image_extraction: image)
808
+ #
809
+ # @example With post-processing
810
+ # postprocessor = Config::PostProcessor.new(
811
+ # enabled: true,
812
+ # enabled_processors: ["quality"]
813
+ # )
814
+ # config = Extraction.new(postprocessor: postprocessor)
815
+ #
816
+ # @example With document structure
817
+ # config = Extraction.new(include_document_structure: true)
818
+ #
819
+ # @example With all options
820
+ # config = Extraction.new(
821
+ # use_cache: true,
822
+ # enable_quality_processing: true,
823
+ # force_ocr: false,
824
+ # include_document_structure: true,
825
+ # ocr: Config::OCR.new(language: "deu"),
826
+ # chunking: Config::Chunking.new(max_chars: 500),
827
+ # language_detection: Config::LanguageDetection.new(enabled: true),
828
+ # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
829
+ # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
830
+ # postprocessor: Config::PostProcessor.new(enabled: true)
831
+ # )
832
+ #
833
+ class Extraction
834
+ attr_reader :use_cache, :enable_quality_processing, :force_ocr,
835
+ :include_document_structure,
836
+ :ocr, :chunking, :language_detection, :pdf_options,
837
+ :images, :postprocessor,
838
+ :token_reduction, :keywords, :html_options, :pages,
839
+ :max_concurrent_extractions, :output_format, :result_format,
840
+ :security_limits
841
+
842
+ # Alias for backward compatibility - image_extraction is the canonical name
843
+ alias image_extraction images
844
+
845
# Keys that are allowed in the Extraction config
ALLOWED_KEYS = %i[
  use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
  language_detection pdf_options image_extraction
  postprocessor token_reduction keywords html_options pages
  max_concurrent_extractions output_format result_format
  security_limits
].freeze

# Aliases for backward compatibility
KEY_ALIASES = {
  images: :image_extraction
}.freeze

# Valid output format values (case-insensitive, normalized internally)
VALID_OUTPUT_FORMATS = %w[plain markdown html djot].freeze

# Valid result format values (case-insensitive, normalized internally)
VALID_RESULT_FORMATS = %w[unified elements element_based].freeze

# Load configuration from a file.
#
# Detects the file format from the extension (.toml, .yaml, .json)
# and loads the configuration accordingly. The raw hash returned by the
# native loader is passed through normalize_hash_keys before the object
# is built.
#
# @param path [String] Path to the configuration file
# @return [Kreuzberg::Config::Extraction] Loaded configuration object
#
# @example Load from TOML
#   config = Kreuzberg::Config::Extraction.from_file("config.toml")
#
# @example Load from YAML
#   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
#
def self.from_file(path)
  hash = Kreuzberg._config_from_file_native(path)
  new(**normalize_hash_keys(hash))
end
883
+
884
# Prepare a raw hash from the native layer for use as keyword arguments:
# symbolize its keys, rename entries listed in KEY_ALIASES to their canonical
# key (only when the canonical key is not already present), then keep only
# the keys named in ALLOWED_KEYS.
def self.normalize_hash_keys(hash)
  by_symbol = hash.transform_keys(&:to_sym)

  KEY_ALIASES.each_pair do |legacy, canonical|
    next unless by_symbol.key?(legacy)
    next if by_symbol.key?(canonical)

    by_symbol[canonical] = by_symbol.delete(legacy)
  end

  by_symbol.slice(*ALLOWED_KEYS)
end
899
+
900
+ private_class_method :normalize_hash_keys
901
+
902
# Discover configuration file in current or parent directories.
#
# Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
# directory and parent directories. The search itself is delegated to the
# native layer; its result is passed through normalize_hash_keys.
#
# @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object or nil if not found
#
# @example
#   config = Kreuzberg::Config::Extraction.discover
#   if config
#     # Use discovered config
#   end
#
def self.discover
  discovered = Kreuzberg._config_discover_native
  discovered.nil? ? nil : new(**normalize_hash_keys(discovered))
end
921
+
922
# Create an extraction configuration.
#
# Settings can be given as keywords, as a positional Hash (String or Symbol
# keys), or both. When both are given, a key present in +hash+ overrides the
# keyword of the same name; keys in +hash+ that are not known settings are
# ignored.
#
# @param hash [Hash, nil] optional settings hash; anything that is not a Hash is ignored
# @param use_cache [Boolean] coerced to true/false
# @param enable_quality_processing [Boolean] coerced to true/false
# @param force_ocr [Boolean] coerced to true/false
# @param include_document_structure [Boolean] coerced to true/false
# @param ocr [OCR, Hash, nil]
# @param chunking [Chunking, Hash, nil]
# @param language_detection [LanguageDetection, Hash, nil]
# @param pdf_options [PDF, Hash, nil]
# @param image_extraction [ImageExtraction, Hash, nil]
# @param postprocessor [PostProcessor, Hash, nil]
# @param token_reduction [TokenReduction, Hash, nil]
# @param keywords [Keywords, Hash, nil]
# @param html_options [HtmlOptions, Hash, nil]
# @param pages [PageConfig, Hash, nil]
# @param max_concurrent_extractions [#to_i, nil] converted with +to_i+ when not nil
# @param output_format [#to_s, nil] must be one of VALID_OUTPUT_FORMATS (case-insensitive)
# @param result_format [#to_s, nil] must be one of VALID_RESULT_FORMATS (case-insensitive)
# @param security_limits [Object, nil] stored as given
# @raise [ArgumentError] for an invalid format value or a sub-config of the wrong type
def initialize(hash = nil,
               use_cache: true,
               enable_quality_processing: true,
               force_ocr: false,
               include_document_structure: false,
               ocr: nil,
               chunking: nil,
               language_detection: nil,
               pdf_options: nil,
               image_extraction: nil,
               postprocessor: nil,
               token_reduction: nil,
               keywords: nil,
               html_options: nil,
               pages: nil,
               max_concurrent_extractions: nil,
               output_format: nil,
               result_format: nil,
               security_limits: nil)
  keyword_values = {
    use_cache: use_cache,
    enable_quality_processing: enable_quality_processing,
    force_ocr: force_ocr,
    include_document_structure: include_document_structure,
    ocr: ocr,
    chunking: chunking,
    language_detection: language_detection,
    pdf_options: pdf_options,
    image_extraction: image_extraction,
    postprocessor: postprocessor,
    token_reduction: token_reduction,
    keywords: keywords,
    html_options: html_options,
    pages: pages,
    max_concurrent_extractions: max_concurrent_extractions,
    output_format: output_format,
    result_format: result_format,
    security_limits: security_limits
  }

  # The positional hash, when given, takes precedence over the keywords.
  assign_attributes(extract_from_hash(hash, keyword_values))
end
956
+
957
# Overlay a user-supplied hash on top of a set of default values.
#
# @param hash [Object] candidate overrides; used only when it is a Hash
# @param defaults [Hash{Symbol => Object}] baseline values
# @return [Hash{Symbol => Object}] +defaults+ itself when +hash+ is not a
#   Hash, otherwise a new hash in which symbolized keys from +hash+ replace
#   the matching defaults (keys absent from +defaults+ are ignored)
def extract_from_hash(hash, defaults)
  if hash.is_a?(Hash)
    known_overrides = hash.transform_keys(&:to_sym).select { |key, _value| defaults.key?(key) }
    defaults.merge(known_overrides)
  else
    defaults
  end
end
963
+
964
# Populate every instance variable from a fully-resolved parameter hash.
#
# Boolean flags are coerced to strict true/false, sub-configurations go
# through +normalize_config+, the two format settings are validated, and
# +security_limits+ is stored as given. The :image_extraction parameter is
# kept in the @images instance variable.
#
# @param params [Hash{Symbol => Object}] values keyed by setting name
# @raise [ArgumentError] for an invalid format or a sub-config of the wrong type
def assign_attributes(params)
  %i[use_cache enable_quality_processing force_ocr include_document_structure].each do |flag|
    instance_variable_set(:"@#{flag}", params[flag] ? true : false)
  end

  # [instance variable, params key, expected config class]
  [
    [:@ocr, :ocr, OCR],
    [:@chunking, :chunking, Chunking],
    [:@language_detection, :language_detection, LanguageDetection],
    [:@pdf_options, :pdf_options, PDF],
    [:@images, :image_extraction, ImageExtraction],
    [:@postprocessor, :postprocessor, PostProcessor],
    [:@token_reduction, :token_reduction, TokenReduction],
    [:@keywords, :keywords, Keywords],
    [:@html_options, :html_options, HtmlOptions],
    [:@pages, :pages, PageConfig]
  ].each do |ivar, key, klass|
    instance_variable_set(ivar, normalize_config(params[key], klass))
  end

  concurrency = params[:max_concurrent_extractions]
  @max_concurrent_extractions = concurrency.nil? ? nil : concurrency.to_i
  @output_format = validate_output_format(params[:output_format])
  @result_format = validate_result_format(params[:result_format])
  @security_limits = params[:security_limits]
end
984
+
985
# Validate and normalize an output_format value.
#
# @param value [#to_s, nil] requested format, in any letter case
# @return [String, nil] the downcased format, or nil when +value+ is nil
# @raise [ArgumentError] when the downcased value is not in VALID_OUTPUT_FORMATS
def validate_output_format(value)
  return nil if value.nil?

  normalized = value.to_s.downcase
  unless VALID_OUTPUT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
  end

  normalized
end
994
+
995
# Validate and normalize a result_format value.
#
# @param value [#to_s, nil] requested format, in any letter case
# @return [String, nil] the downcased format, or nil when +value+ is nil
# @raise [ArgumentError] when the downcased value is not in VALID_RESULT_FORMATS
def validate_result_format(value)
  return nil if value.nil?

  normalized = value.to_s.downcase
  unless VALID_RESULT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
  end

  normalized
end
1004
+
1005
# Convert the configuration to a plain Hash.
#
# Scalar settings and sub-configurations are combined into one hash;
# entries whose value is nil are left out.
#
# @return [Hash{Symbol => Object}]
def to_h
  combined = {}
  combined.update(core_config_hash)
  combined.update(sub_config_hash)
  combined.reject { |_key, value| value.nil? }
end
1008
+
1009
# Scalar (non sub-config) settings as a Hash.
#
# Includes +security_limits+: the constructor accepts and stores it, so it
# must be part of the hash representation too. Otherwise it is silently
# dropped by +to_h+, and with it by +to_json+, +get_field+ and +merge+.
# The value is emitted exactly as stored. nil entries are kept here; +to_h+
# removes them.
#
# @return [Hash{Symbol => Object}]
def core_config_hash
  {
    use_cache: @use_cache,
    enable_quality_processing: @enable_quality_processing,
    force_ocr: @force_ocr,
    include_document_structure: @include_document_structure,
    max_concurrent_extractions: @max_concurrent_extractions,
    output_format: @output_format,
    result_format: @result_format,
    security_limits: @security_limits
  }
end
1020
+
1021
# Sub-configurations as a Hash of hashes.
#
# Each configured section is converted with +to_h+; sections that are not
# set stay nil (and are later removed by +to_h+). The image settings held
# in @images are exported under the :image_extraction key.
#
# @return [Hash{Symbol => Hash, nil}]
def sub_config_hash
  sections = {
    ocr: @ocr,
    chunking: @chunking,
    language_detection: @language_detection,
    pdf_options: @pdf_options,
    image_extraction: @images,
    postprocessor: @postprocessor,
    token_reduction: @token_reduction,
    keywords: @keywords,
    html_options: @html_options,
    pages: @pages
  }
  sections.transform_values { |section| section&.to_h }
end
1030
+
1031
# Serialize the configuration to a JSON string.
#
# Any arguments are accepted for compatibility with the usual +to_json+
# signature and are ignored.
#
# @return [String] JSON representation of {#to_h}
#
# @example
#   config = Extraction.new(use_cache: true)
#   json = config.to_json
#   puts json # => "{\"use_cache\":true,...}"
#
def to_json(*_args)
  # Generated in Ruby on purpose: the native serializer has issues.
  JSON.generate(to_h)
end
1045
+
1046
# Look up a field in the hash representation of the configuration.
#
# Dot notation walks into nested hashes (e.g., "ocr.backend"). At every
# level the Symbol form of the path segment is tried first, then the String
# form.
#
# @param field_name [String, Symbol] field name or dotted path
# @return [Object, nil] the value found, or nil when any segment is missing
#   or an intermediate value is not a Hash
#
# @example Get a top-level field
#   config = Extraction.new(use_cache: true)
#   config.get_field("use_cache") # => true
#
# @example Get a nested field
#   config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
#   config.get_field("ocr.backend") # => "tesseract"
#
def get_field(field_name)
  node = to_h

  field_name.to_s.split('.').each do |segment|
    # Cannot descend into a scalar (or a missing value).
    return nil unless node.is_a?(Hash)

    node = node.fetch(segment.to_sym) { node.fetch(segment, nil) }
  end

  node
end
1078
+
1079
# Merge another configuration into this one
#
# Returns a new configuration with fields from the other config overriding
# fields from this config (shallow merge). The receiver is not modified.
#
# When +other+ is a Hash, only the keys it actually contains override this
# configuration; String keys are accepted and nil values are ignored. The
# Hash is no longer expanded into a full Extraction first, because doing so
# injected constructor defaults (e.g. use_cache: true) that silently
# overwrote this configuration's values for keys the caller never supplied.
#
# When +other+ is an Extraction, every field it exports through +to_h+
# overrides this configuration. That includes its boolean flags, which are
# always present, since an Extraction cannot tell an explicit value from a
# default.
#
# @param other [Extraction, Hash] Configuration to merge
# @return [Extraction] New merged configuration
# @raise [ArgumentError] if a Hash contains an unknown key or an invalid value
#
# @example
#   base = Extraction.new(use_cache: true, force_ocr: false)
#   override = Extraction.new(force_ocr: true)
#   merged = base.merge(override)
#   merged.use_cache # => true
#   merged.force_ocr # => true
#
# @example Partial override with a Hash
#   base = Extraction.new(use_cache: false)
#   base.merge(force_ocr: true).use_cache # => false
#
def merge(other)
  overrides =
    if other.is_a?(Extraction)
      other.to_h
    else
      other.transform_keys(&:to_sym).compact
    end

  Extraction.new(**to_h.merge(overrides))
end
1100
+
1101
# Merge another configuration into this one (mutating)
#
# Computes the same result as {#merge} and then copies the merged fields
# back into this object.
#
# @param other [Extraction, Hash] Configuration to merge
# @return [self]
#
# @example
#   base = Extraction.new(use_cache: true, force_ocr: false)
#   override = Extraction.new(force_ocr: true)
#   base.merge!(override)
#   base.use_cache # => true
#   base.force_ocr # => true
#
def merge!(other)
  # #merge accepts both an Extraction and a Hash, so no conversion is needed here.
  update_from_merged(merge(other))
  self
end
1121
+
1122
# Set a configuration field using hash-like syntax
#
# The value goes through the same coercion/validation as in the
# constructor: boolean flags become true/false, sub-configurations accept a
# config object, a Hash or nil, and the two format settings are validated.
# +security_limits+ is stored as given; it is accepted here because the
# constructor accepts it too.
#
# @param key [Symbol, String] Field name to set
# @param value [Object] Value to set
# @return [Object] The value that was set
# @raise [ArgumentError] for an unknown key, an invalid format value, or a
#   sub-config of the wrong type
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache] = false
#   config[:force_ocr] = true
#
# rubocop:disable Metrics/MethodLength
def []=(key, value)
  case key.to_sym
  when :use_cache then @use_cache = value ? true : false
  when :enable_quality_processing then @enable_quality_processing = value ? true : false
  when :force_ocr then @force_ocr = value ? true : false
  when :include_document_structure then @include_document_structure = value ? true : false
  when :ocr then @ocr = normalize_config(value, OCR)
  when :chunking then @chunking = normalize_config(value, Chunking)
  when :language_detection then @language_detection = normalize_config(value, LanguageDetection)
  when :pdf_options then @pdf_options = normalize_config(value, PDF)
  # Image settings are kept in @images but addressed as :image_extraction.
  when :image_extraction then @images = normalize_config(value, ImageExtraction)
  when :postprocessor then @postprocessor = normalize_config(value, PostProcessor)
  when :token_reduction then @token_reduction = normalize_config(value, TokenReduction)
  when :keywords then @keywords = normalize_config(value, Keywords)
  when :html_options then @html_options = normalize_config(value, HtmlOptions)
  when :pages then @pages = normalize_config(value, PageConfig)
  when :max_concurrent_extractions then @max_concurrent_extractions = value&.to_i
  when :output_format then @output_format = validate_output_format(value)
  when :result_format then @result_format = validate_result_format(value)
  when :security_limits then @security_limits = value
  else
    raise ArgumentError, "Unknown configuration key: #{key}"
  end
end
# rubocop:enable Metrics/MethodLength
1176
+
1177
# Get a configuration field using hash-like syntax
#
# Only public, zero-argument readers can be reached. The key is dispatched
# with +public_send+ after a +respond_to?+ check, so a caller-supplied key
# can no longer invoke private methods (including private Kernel methods
# such as +exit+). The former blanket +rescue NoMethodError+ is gone: an
# error raised inside a reader now propagates instead of being reported as
# a missing field.
#
# @param key [Symbol, String] Field name to get
# @return [Object, nil] The field value, or nil when there is no public
#   method of that name (or the key cannot be converted to a Symbol)
#
# @example
#   config = Extraction.new(use_cache: true)
#   config[:use_cache] # => true
#
def [](key)
  return nil unless key.respond_to?(:to_sym)

  reader = key.to_sym
  respond_to?(reader) ? public_send(reader) : nil
end
1191
+
1192
# Assign the output format after validating it.
#
# @param value [String, Symbol, nil] one of VALID_OUTPUT_FORMATS (any letter case) or nil
# @return [String, nil] the normalized value that was stored
# @raise [ArgumentError] when the value is not a valid output format
#
def output_format=(value)
  checked = validate_output_format(value)
  @output_format = checked
end
1200
+
1201
# Assign the result format after validating it.
#
# @param value [String, Symbol, nil] one of VALID_RESULT_FORMATS (any letter case) or nil
# @return [String, nil] the normalized value that was stored
# @raise [ArgumentError] when the value is not a valid result format
#
def result_format=(value)
  checked = validate_result_format(value)
  @result_format = checked
end
1209
+
1210
+ private
1211
+
1212
# Coerce a sub-configuration value into an instance of +klass+.
#
# @param value [Object, Hash, nil] existing config object, attribute hash, or nil
# @param klass [Class] expected configuration class
# @return [Object, nil] nil for nil, +value+ itself when it already is a
#   +klass+, or +klass.new+ called with the hash's symbolized keys as keywords
# @raise [ArgumentError] when +value+ is none of the accepted kinds
def normalize_config(value, klass)
  case value
  when nil, klass
    value
  when Hash
    klass.new(**value.transform_keys(&:to_sym))
  else
    raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
  end
end
1219
+
1220
# Copy every field of a merged configuration into this object.
#
# Values are read through the public readers of +merged+ and written
# straight to the matching instance variables. The image settings are read
# via +image_extraction+ and stored in @images. @security_limits is not
# touched by this method.
#
# @param merged [Extraction] configuration whose fields are copied
# @return [Object] the copied result_format value
def update_from_merged(merged)
  # instance variable name => reader on +merged+
  {
    use_cache: :use_cache,
    enable_quality_processing: :enable_quality_processing,
    force_ocr: :force_ocr,
    include_document_structure: :include_document_structure,
    ocr: :ocr,
    chunking: :chunking,
    language_detection: :language_detection,
    pdf_options: :pdf_options,
    images: :image_extraction,
    postprocessor: :postprocessor,
    token_reduction: :token_reduction,
    keywords: :keywords,
    html_options: :html_options,
    pages: :pages,
    max_concurrent_extractions: :max_concurrent_extractions,
    output_format: :output_format,
    result_format: :result_format
  }.each do |ivar, reader|
    instance_variable_set(:"@#{ivar}", merged.public_send(reader))
  end

  @result_format
end
1239
+ end
1240
+ end
1241
+ end