kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
data/sig/kreuzberg.rbs ADDED
@@ -0,0 +1,1337 @@
1
+ # Type signatures for Kreuzberg document intelligence framework
2
+
3
+ module Kreuzberg
4
+ VERSION: String # version constant, typed as String
5
+
6
+ # Integer error-code constants, one per category: success, generic, panic, invalid argument, I/O, parsing, OCR, missing dependency
7
+ ERROR_CODE_SUCCESS: Integer
8
+ ERROR_CODE_GENERIC: Integer
9
+ ERROR_CODE_PANIC: Integer
10
+ ERROR_CODE_INVALID_ARGUMENT: Integer
11
+ ERROR_CODE_IO: Integer
12
+ ERROR_CODE_PARSING: Integer
13
+ ERROR_CODE_OCR: Integer
14
+ ERROR_CODE_MISSING_DEPENDENCY: Integer
15
+
16
+ # Union of the string literals that name a semantic element kind (T.type_alias). NOTE(review): Element#element_type in this file is typed as plain String, not this alias - confirm that is intentional.
17
+ type element_type = 'title' | 'narrative_text' | 'heading' | 'list_item' | 'table' | 'image' | 'page_break' | 'code_block' | 'block_quote' | 'footer' | 'header'
18
+
19
+ # Bounding box for element positioning (T::Struct from types.rb): four Float coordinates x0, y0, x1, y1, all required keywords of the initializer; serialize returns a Symbol-keyed Hash.
20
+ class BoundingBox attr_reader x0: Float
21
+ attr_reader y0: Float
22
+ attr_reader x1: Float
23
+ attr_reader y1: Float
24
+
25
+ def initialize: (x0: Float, y0: Float, x1: Float, y1: Float) -> void
26
+ def serialize: () -> Hash[Symbol, untyped]
27
+ end
28
+
29
+ # Metadata for a semantic element (T::Struct from types.rb). Every field except `additional` is nilable, yet all five are required keywords of the initializer (nil must be passed explicitly).
30
+ class ElementMetadata attr_reader page_number: Integer?
31
+ attr_reader filename: String?
32
+ attr_reader coordinates: BoundingBox?
33
+ attr_reader element_index: Integer?
34
+ attr_reader additional: Hash[String, String]
35
+
36
+ def initialize: (page_number: Integer?, filename: String?, coordinates: BoundingBox?, element_index: Integer?, additional: Hash[String, String]) -> void
37
+ def serialize: () -> Hash[Symbol, untyped]
38
+ end
39
+
40
+ # Semantic element extracted from a document (T::Struct from types.rb): an id, a type string, the text and a non-nil ElementMetadata; all four are required keywords.
41
+ class Element attr_reader element_id: String
42
+ attr_reader element_type: String
43
+ attr_reader text: String
44
+ attr_reader metadata: ElementMetadata
45
+
46
+ def initialize: (element_id: String, element_type: String, text: String, metadata: ElementMetadata) -> void
47
+ def serialize: () -> Hash[Symbol, untyped]
48
+ end
49
+
50
+ # Header/heading metadata (T::Struct from types.rb): level, text, nilable id, depth and html_offset (presumably an offset into the source HTML - TODO confirm units).
51
+ class HeaderMetadata attr_reader level: Integer
52
+ attr_reader text: String
53
+ attr_reader id: String?
54
+ attr_reader depth: Integer
55
+ attr_reader html_offset: Integer
56
+
57
+ def initialize: (level: Integer, text: String, id: String?, depth: Integer, html_offset: Integer) -> void
58
+ def serialize: () -> Hash[Symbol, untyped]
59
+ end
60
+
61
+ # Link metadata (T::Struct from types.rb): href, text, nilable title, link_type, an Array of rel strings and a String-to-String attribute map.
62
+ class LinkMetadata attr_reader href: String
63
+ attr_reader text: String
64
+ attr_reader title: String?
65
+ attr_reader link_type: String
66
+ attr_reader rel: Array[String]
67
+ attr_reader attributes: Hash[String, String]
68
+
69
+ def initialize: (href: String, text: String, title: String?, link_type: String, rel: Array[String], attributes: Hash[String, String]) -> void
70
+ def serialize: () -> Hash[Symbol, untyped]
71
+ end
72
+
73
+ # Image metadata (T::Struct from types.rb): src, nilable alt/title, image_type and a String-to-String attribute map; dimensions is a nilable Array[Integer] (presumably [width, height] - TODO confirm).
74
+ class ImageMetadata attr_reader src: String
75
+ attr_reader alt: String?
76
+ attr_reader title: String?
77
+ attr_reader dimensions: Array[Integer]?
78
+ attr_reader image_type: String
79
+ attr_reader attributes: Hash[String, String]
80
+
81
+ def initialize: (src: String, alt: String?, title: String?, dimensions: Array[Integer]?, image_type: String, attributes: Hash[String, String]) -> void
82
+ def serialize: () -> Hash[Symbol, untyped]
83
+ end
84
+
85
+ # Structured data metadata (T::Struct from types.rb): data_type, the payload as a raw JSON String (not parsed) and a nilable schema_type.
86
+ class StructuredData attr_reader data_type: String
87
+ attr_reader raw_json: String
88
+ attr_reader schema_type: String?
89
+
90
+ def initialize: (data_type: String, raw_json: String, schema_type: String?) -> void
91
+ def serialize: () -> Hash[Symbol, untyped]
92
+ end
93
+
94
+ # HTML metadata (T::Struct from types.rb): nilable scalar fields plus non-nil collections (keywords, open_graph, twitter_card, meta_tags, headers, links, images, structured_data). The initializer requires every keyword.
95
+ class HtmlMetadata attr_reader title: String?
96
+ attr_reader description: String?
97
+ attr_reader author: String?
98
+ attr_reader copyright: String?
99
+ attr_reader keywords: Array[String]
100
+ attr_reader canonical_url: String?
101
+ attr_reader language: String?
102
+ attr_reader text_direction: String?
103
+ attr_reader mime_type: String?
104
+ attr_reader charset: String?
105
+ attr_reader generator: String?
106
+ attr_reader viewport: String?
107
+ attr_reader theme_color: String?
108
+ attr_reader application_name: String?
109
+ attr_reader robots: String?
110
+ attr_reader open_graph: Hash[String, String]
111
+ attr_reader twitter_card: Hash[String, String]
112
+ attr_reader meta_tags: Hash[String, String]
113
+ attr_reader headers: Array[HeaderMetadata]
114
+ attr_reader links: Array[LinkMetadata]
115
+ attr_reader images: Array[ImageMetadata]
116
+ attr_reader structured_data: Array[StructuredData]
117
+
118
+ def initialize: (
119
+ title: String?,
120
+ description: String?,
121
+ author: String?,
122
+ copyright: String?,
123
+ keywords: Array[String],
124
+ canonical_url: String?,
125
+ language: String?,
126
+ text_direction: String?,
127
+ mime_type: String?,
128
+ charset: String?,
129
+ generator: String?,
130
+ viewport: String?,
131
+ theme_color: String?,
132
+ application_name: String?,
133
+ robots: String?,
134
+ open_graph: Hash[String, String],
135
+ twitter_card: Hash[String, String],
136
+ meta_tags: Hash[String, String],
137
+ headers: Array[HeaderMetadata],
138
+ links: Array[LinkMetadata],
139
+ images: Array[ImageMetadata],
140
+ structured_data: Array[StructuredData]
141
+ ) -> void
142
+ def serialize: () -> Hash[Symbol, untyped]
143
+ end
144
+
145
+ # Extracted keyword with relevance metadata (T::Struct from types.rb): text, Float score and algorithm name; positions is nilable and the only optional initializer keyword.
146
+ class ExtractedKeyword attr_reader text: String
147
+ attr_reader score: Float
148
+ attr_reader algorithm: String
149
+ attr_reader positions: Array[Integer]?
150
+
151
+ def initialize: (text: String, score: Float, algorithm: String, ?positions: Array[Integer]?) -> void
152
+ def serialize: () -> Hash[Symbol, untyped]
153
+ end
154
+
155
+ # Processing warning from a pipeline stage (T::Struct from types.rb): `source` names where the warning came from, `message` carries its text; both are required Strings.
156
+ class ProcessingWarning attr_reader source: String
157
+ attr_reader message: String
158
+
159
+ def initialize: (source: String, message: String) -> void
160
+ def serialize: () -> Hash[Symbol, untyped]
161
+ end
162
+
163
+ # Bounding box for document node positioning (T::Struct from types.rb). Declares the same four Float fields as BoundingBox but is a separate, unrelated class.
164
+ class DocumentBoundingBox attr_reader x0: Float
165
+ attr_reader y0: Float
166
+ attr_reader x1: Float
167
+ attr_reader y1: Float
168
+
169
+ def initialize: (x0: Float, y0: Float, x1: Float, y1: Float) -> void
170
+ def serialize: () -> Hash[Symbol, untyped]
171
+ end
172
+
173
+ # Annotation for a document node (T::Struct from types.rb). NOTE(review): this class declares key/value Strings, whereas document_annotation_hash in this file uses start/end/kind - confirm which shape is current.
174
+ class DocumentAnnotation attr_reader key: String
175
+ attr_reader value: String
176
+
177
+ def initialize: (key: String, value: String) -> void
178
+ def serialize: () -> Hash[Symbol, untyped]
179
+ end
180
+
181
+ # Single node in the document structure tree (T::Struct from types.rb). parent/children are Integers (presumably indexes into DocumentStructure#nodes - confirm). NOTE(review): content is String here but Hash[untyped, untyped] in document_node_hash.
182
+ class DocumentNode attr_reader id: String
183
+ attr_reader content: String
184
+ attr_reader parent: Integer?
185
+ attr_reader children: Array[Integer]
186
+ attr_reader content_layer: String
187
+ attr_reader page: Integer?
188
+ attr_reader page_end: Integer?
189
+ attr_reader bbox: DocumentBoundingBox?
190
+ attr_reader annotations: Array[DocumentAnnotation]
191
+
192
+ def initialize: (
193
+ id: String,
194
+ content: String,
195
+ parent: Integer?,
196
+ children: Array[Integer],
197
+ content_layer: String,
198
+ page: Integer?,
199
+ page_end: Integer?,
200
+ bbox: DocumentBoundingBox?,
201
+ annotations: Array[DocumentAnnotation]
202
+ ) -> void
203
+ def serialize: () -> Hash[Symbol, untyped]
204
+ end
205
+
206
+ # Structured document representation (T::Struct from types.rb): a single required field, the flat Array of DocumentNode.
207
+ class DocumentStructure attr_reader nodes: Array[DocumentNode]
208
+
209
+ def initialize: (nodes: Array[DocumentNode]) -> void
210
+ def serialize: () -> Hash[Symbol, untyped]
211
+ end
212
+
213
+ # Config namespace (defined in lib/kreuzberg/config.rb)
214
+ module Config
215
+ class OCR # OCR settings: backend and language Strings plus nilable Tesseract, PaddleOcr and OcrElementConfig sub-configs
216
+ attr_reader backend: String
217
+ attr_reader language: String
218
+ attr_reader tesseract_config: Tesseract?
219
+ attr_reader paddle_ocr_config: PaddleOcr?
220
+ attr_reader element_config: OcrElementConfig?
221
+
222
+ def initialize: (?backend: String, ?language: String, ?tesseract_config: (Tesseract | Hash[Symbol, untyped])?, ?paddle_ocr_config: (PaddleOcr | Hash[Symbol, untyped])?, ?element_config: (OcrElementConfig | Hash[Symbol, untyped])?) -> void # every keyword optional; sub-configs accepted as objects or Symbol-keyed Hashes
223
+ def to_h: () -> Hash[Symbol, untyped]
224
+ end
225
+
226
+ class Tesseract # free-form Tesseract options: the initializer takes arbitrary untyped keywords, exposed through `options`
227
+ attr_reader options: Hash[Symbol, untyped]
228
+
229
+ def initialize: (**untyped options) -> void
230
+ def to_h: () -> Hash[Symbol, untyped]
231
+ end
232
+
233
+ class PaddleOcr # PaddleOCR settings: every attribute is nilable and every initializer keyword is optional
234
+ attr_reader language: String?
235
+ attr_reader cache_dir: String?
236
+ attr_reader use_angle_cls: bool?
237
+ attr_reader enable_table_detection: bool?
238
+ attr_reader det_db_thresh: Float?
239
+ attr_reader det_db_box_thresh: Float?
240
+ attr_reader det_db_unclip_ratio: Float?
241
+ attr_reader det_limit_side_len: Integer?
242
+ attr_reader rec_batch_num: Integer?
243
+ def initialize: (?language: String?, ?cache_dir: String?, ?use_angle_cls: bool?, ?enable_table_detection: bool?, ?det_db_thresh: Float?, ?det_db_box_thresh: Float?, ?det_db_unclip_ratio: Float?, ?det_limit_side_len: Integer?, ?rec_batch_num: Integer?) -> void
244
+ def to_h: () -> Hash[Symbol, untyped]
245
+ end
246
+
247
+ class OcrElementConfig # OCR element options: include_elements and build_hierarchy flags (non-nil) plus nilable min_level / min_confidence
248
+ attr_reader include_elements: bool
249
+ attr_reader min_level: String?
250
+ attr_reader min_confidence: Float?
251
+ attr_reader build_hierarchy: bool
252
+ def initialize: (?include_elements: bool, ?min_level: String?, ?min_confidence: Float?, ?build_hierarchy: bool) -> void
253
+ def to_h: () -> Hash[Symbol, untyped]
254
+ end
255
+
256
+ class Chunking # Chunking options. NOTE(review): initialize also accepts chunk_size / chunk_overlap, which have no reader (presumably aliases for max_chars / max_overlap - confirm)
257
+ attr_reader max_chars: Integer
258
+ attr_reader max_overlap: Integer
259
+ attr_reader preset: String?
260
+ attr_reader embedding: Embedding?
261
+ attr_reader enabled: bool? # reader is nilable although the initializer keyword below is plain bool
262
+
263
+ def initialize: (
264
+ ?max_chars: Integer?,
265
+ ?max_overlap: Integer?,
266
+ ?preset: String?,
267
+ ?embedding: (Embedding | Hash[Symbol, untyped])?,
268
+ ?chunk_size: Integer?,
269
+ ?chunk_overlap: Integer?,
270
+ ?enabled: bool
271
+ ) -> void
272
+ def to_h: () -> Hash[Symbol, untyped]
273
+ end
274
+
275
+ class Embedding # Embedding options: model is a Symbol-keyed Hash (shape not specified here); the remaining attributes are nilable
276
+ attr_reader model: Hash[Symbol, untyped]
277
+ attr_reader normalize: bool?
278
+ attr_reader batch_size: Integer?
279
+ attr_reader show_download_progress: bool?
280
+ attr_reader cache_dir: String?
281
+
282
+ def initialize: (
283
+ ?model: Hash[Symbol, untyped],
284
+ ?normalize: bool?,
285
+ ?batch_size: Integer?,
286
+ ?show_download_progress: bool?,
287
+ ?cache_dir: String?
288
+ ) -> void
289
+ def to_h: () -> Hash[Symbol, untyped]
290
+ end
291
+
292
+ class LanguageDetection # Language-detection options: enabled flag, Float min_confidence and detect_multiple flag; all non-nil, all optional keywords
293
+ attr_reader enabled: bool
294
+ attr_reader min_confidence: Float
295
+ attr_reader detect_multiple: bool
296
+
297
+ def initialize: (?enabled: bool, ?min_confidence: Float, ?detect_multiple: bool) -> void
298
+ def to_h: () -> Hash[Symbol, untyped]
299
+ end
300
+
301
+ class FontConfig # Font options; the only class in this Config namespace declared with attr_accessor (readable and writable) instead of attr_reader
302
+ attr_accessor enabled: bool
303
+ attr_accessor custom_font_dirs: Array[String]?
304
+
305
+ def initialize: (?enabled: bool, ?custom_font_dirs: Array[String]?) -> void
306
+ def to_h: () -> Hash[Symbol, untyped]
307
+ end
308
+
309
+ class Hierarchy # Hierarchy options: enabled, k_clusters, include_bbox and a nilable ocr_coverage_threshold
310
+ attr_reader enabled: bool
311
+ attr_reader k_clusters: Integer
312
+ attr_reader include_bbox: bool
313
+ attr_reader ocr_coverage_threshold: Float?
314
+
315
+ def initialize: (?enabled: bool, ?k_clusters: Integer, ?include_bbox: bool, ?ocr_coverage_threshold: Float?) -> void
316
+ def to_h: () -> Hash[Symbol, untyped]
317
+ def self.from_h: (Hash[Symbol, untyped]?) -> Hierarchy? # accepts a nilable Hash and may return nil
318
+ end
319
+
320
+ class PDF # PDF options; passwords may be given as one String or an Array, while the reader is typed Array[String]?
321
+ attr_reader extract_images: bool
322
+ attr_reader passwords: Array[String]?
323
+ attr_reader extract_metadata: bool
324
+ attr_reader font_config: FontConfig?
325
+ attr_reader hierarchy: Hierarchy?
326
+
327
+ def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?) -> void
328
+ def to_h: () -> Hash[Symbol, untyped]
329
+ end
330
+
331
+ class ImageExtraction # Image extraction options: extract_images / auto_adjust_dpi flags and Integer DPI and dimension settings; all non-nil, all optional keywords
332
+ attr_reader extract_images: bool
333
+ attr_reader target_dpi: Integer
334
+ attr_reader max_image_dimension: Integer
335
+ attr_reader auto_adjust_dpi: bool
336
+ attr_reader min_dpi: Integer
337
+ attr_reader max_dpi: Integer
338
+
339
+ def initialize: (
340
+ ?extract_images: bool,
341
+ ?target_dpi: Integer,
342
+ ?max_image_dimension: Integer,
343
+ ?auto_adjust_dpi: bool,
344
+ ?min_dpi: Integer,
345
+ ?max_dpi: Integer
346
+ ) -> void
347
+ def to_h: () -> Hash[Symbol, untyped]
348
+ end
349
+
350
+ class ImagePreprocessing # Image preprocessing options: target_dpi, five boolean switches and a binarization_method String. NOTE(review): not referenced by any other signature visible in this namespace - confirm where it is consumed
351
+ attr_reader target_dpi: Integer
352
+ attr_reader auto_rotate: bool
353
+ attr_reader deskew: bool
354
+ attr_reader denoise: bool
355
+ attr_reader contrast_enhance: bool
356
+ attr_reader binarization_method: String
357
+ attr_reader invert_colors: bool
358
+
359
+ def initialize: (
360
+ ?target_dpi: Integer,
361
+ ?auto_rotate: bool,
362
+ ?deskew: bool,
363
+ ?denoise: bool,
364
+ ?contrast_enhance: bool,
365
+ ?binarization_method: String,
366
+ ?invert_colors: bool
367
+ ) -> void
368
+ def to_h: () -> Hash[Symbol, untyped]
369
+ end
370
+
371
+ class TokenReduction # Token reduction options: a mode String and a preserve_important_words flag; both optional keywords
372
+ attr_reader mode: String
373
+ attr_reader preserve_important_words: bool
374
+
375
+ def initialize: (?mode: String, ?preserve_important_words: bool) -> void
376
+ def to_h: () -> Hash[Symbol, untyped]
377
+ end
378
+
379
+ class PostProcessor # Post-processing options: enabled flag plus nilable lists of enabled / disabled processor names
380
+ attr_reader enabled: bool
381
+ attr_reader enabled_processors: Array[String]?
382
+ attr_reader disabled_processors: Array[String]?
383
+
384
+ def initialize: (?enabled: bool, ?enabled_processors: Array[String]?, ?disabled_processors: Array[String]?) -> void
385
+ def to_h: () -> Hash[Symbol, untyped]
386
+ end
387
+
388
+ class HtmlPreprocessing # HTML preprocessing options; all attributes nilable, preset is a Symbol. NOTE(review): Extraction in this file has no attribute of this type - confirm where it is consumed
389
+ attr_reader enabled: bool?
390
+ attr_reader preset: Symbol?
391
+ attr_reader remove_navigation: bool?
392
+ attr_reader remove_forms: bool?
393
+
394
+ def initialize: (?enabled: bool?, ?preset: Symbol?, ?remove_navigation: bool?, ?remove_forms: bool?) -> void
395
+ def to_h: () -> Hash[Symbol, untyped]
396
+ end
397
+
398
+ class HtmlOptions # free-form HTML options: the initializer takes arbitrary untyped keywords, exposed through `options`
399
+ attr_reader options: Hash[Symbol, untyped]
400
+
401
+ def initialize: (**untyped options) -> void
402
+ def to_h: () -> Hash[Symbol, untyped]
403
+ end
404
+
405
+ class KeywordYakeParams # YAKE-specific keyword parameters: a single optional Integer window_size
406
+ attr_reader window_size: Integer
407
+
408
+ def initialize: (?window_size: Integer) -> void
409
+ def to_h: () -> Hash[Symbol, untyped]
410
+ end
411
+
412
+ class KeywordRakeParams # RAKE-specific keyword parameters: optional Integer min_word_length and max_words_per_phrase
413
+ attr_reader min_word_length: Integer
414
+ attr_reader max_words_per_phrase: Integer
415
+
416
+ def initialize: (?min_word_length: Integer, ?max_words_per_phrase: Integer) -> void
417
+ def to_h: () -> Hash[Symbol, untyped]
418
+ end
419
+
420
+ class Keywords # Keyword extraction options; algorithm and language accept Symbol or String as input but are read back as String?
421
+ attr_reader algorithm: String?
422
+ attr_reader max_keywords: Integer?
423
+ attr_reader min_score: Float?
424
+ attr_reader ngram_range: Array[Integer]?
425
+ attr_reader language: String?
426
+ attr_reader yake_params: KeywordYakeParams?
427
+ attr_reader rake_params: KeywordRakeParams?
428
+
429
+ def initialize: (
430
+ ?algorithm: (Symbol | String)?,
431
+ ?max_keywords: Integer?,
432
+ ?min_score: Float?,
433
+ ?ngram_range: Array[Integer]?,
434
+ ?language: (Symbol | String)?,
435
+ ?yake_params: (KeywordYakeParams | Hash[Symbol, untyped])?,
436
+ ?rake_params: (KeywordRakeParams | Hash[Symbol, untyped])?
437
+ ) -> void
438
+ def to_h: () -> Hash[Symbol, untyped]
439
+ end
440
+
441
+ class PageConfig # Page options: extract_pages and insert_page_markers flags plus a marker_format String; all optional keywords
442
+ attr_reader extract_pages: bool
443
+ attr_reader insert_page_markers: bool
444
+ attr_reader marker_format: String
445
+
446
+ def initialize: (?extract_pages: bool, ?insert_page_markers: bool, ?marker_format: String) -> void
447
+ def to_h: () -> Hash[Symbol, untyped]
448
+ end
449
+
450
+ class Extraction # Top-level extraction configuration: four boolean switches plus nilable sub-configs; every initializer keyword is optional
451
+ attr_reader use_cache: bool
452
+ attr_reader enable_quality_processing: bool
453
+ attr_reader force_ocr: bool
454
+ attr_reader include_document_structure: bool
455
+ attr_reader ocr: OCR?
456
+ attr_reader chunking: Chunking?
457
+ attr_reader language_detection: LanguageDetection?
458
+ attr_reader pdf_options: PDF?
459
+ attr_reader images: ImageExtraction?
460
+ attr_reader postprocessor: PostProcessor?
461
+ attr_reader token_reduction: TokenReduction?
462
+ attr_reader keywords: Keywords?
463
+ attr_reader html_options: HtmlOptions?
464
+ attr_reader pages: PageConfig?
465
+ attr_reader max_concurrent_extractions: Integer?
466
+ attr_reader output_format: String?
467
+ attr_reader result_format: String?
468
+ attr_reader security_limits: Hash[String, Integer]? # NOTE(review): no matching initializer keyword below - confirm how this is populated
469
+
470
+ alias image_extraction images # image_extraction is also the initializer keyword for this attribute
471
+
472
+ def self.from_file: (String path) -> Extraction
473
+ def self.discover: () -> Extraction? # may return nil, unlike from_file
474
+ def initialize: (
475
+ ?use_cache: bool,
476
+ ?enable_quality_processing: bool,
477
+ ?force_ocr: bool,
478
+ ?include_document_structure: bool,
479
+ ?ocr: (OCR | Hash[Symbol, untyped])?,
480
+ ?chunking: (Chunking | Hash[Symbol, untyped])?,
481
+ ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
482
+ ?pdf_options: (PDF | Hash[Symbol, untyped])?,
483
+ ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
484
+ ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
485
+ ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
486
+ ?keywords: (Keywords | Hash[Symbol, untyped])?,
487
+ ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
488
+ ?pages: (PageConfig | Hash[Symbol, untyped])?,
489
+ ?max_concurrent_extractions: Integer?,
490
+ ?output_format: String?,
491
+ ?result_format: String?
492
+ ) -> void
493
+ def to_h: () -> Hash[Symbol, untyped]
494
+ def to_json: (*untyped) -> String
495
+ def get_field: (String | Symbol field_name) -> untyped
496
+ def merge: (Extraction | Hash[Symbol, untyped] other) -> Extraction
497
+ def merge!: (Extraction | Hash[Symbol, untyped] other) -> self
498
+ def []: (Symbol | String key) -> untyped
499
+ def []=: (Symbol | String key, untyped value) -> untyped
500
+ def output_format=: (String? value) -> String?
501
+ def result_format=: (String? value) -> String?
502
+
503
+ private
504
+
505
+ def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
506
+ def extract_from_hash: (Hash[Symbol, untyped]? hash, Hash[Symbol, untyped] defaults) -> Hash[Symbol, untyped]
507
+ def assign_attributes: (Hash[Symbol, untyped] params) -> void
508
+ def validate_output_format: (untyped value) -> String?
509
+ def validate_result_format: (untyped value) -> String?
510
+ def update_from_merged: (Extraction merged) -> void
511
+ end
512
+
513
+ end
514
+
515
+ # Constant typed as the Config::Extraction class object itself, i.e. an alias (for API consistency with other language bindings)
516
+ ExtractionConfig: singleton(Config::Extraction)
517
+
518
+ # Constant typed as the Config::PageConfig class object itself, i.e. an alias (for API consistency with other language bindings)
519
+ PageConfig: singleton(Config::PageConfig)
520
+
521
+ # Keyword algorithm constants, both typed as Symbol (Config::Keywords#initialize accepts Symbol or String for `algorithm`)
522
+ module KeywordAlgorithm
523
+ YAKE: Symbol
524
+ RAKE: Symbol
525
+ end
526
+
527
+ # Record type for an extraction result in Hash form: content, mime_type, metadata_json and metadata are non-nil; every other key is nilable
528
+ type extraction_result_hash = {
529
+ content: String,
530
+ mime_type: String,
531
+ metadata_json: String,
532
+ metadata: Hash[String, untyped],
533
+ tables: Array[table_hash]?,
534
+ detected_languages: Array[String]?,
535
+ chunks: Array[chunk_hash]?,
536
+ images: Array[image_hash]?,
537
+ pages: Array[page_content_hash]?,
538
+ elements: Array[element_hash]?,
539
+ ocr_elements: Array[ocr_element_hash]?,
540
+ djot_content: djot_content_hash?,
541
+ document: document_structure_hash?,
542
+ extracted_keywords: Array[extracted_keyword_hash]?,
543
+ quality_score: Float?,
544
+ processing_warnings: Array[processing_warning_hash]?
545
+ }
546
+
547
+ type extracted_keyword_hash = { # record with the same four fields as the ExtractedKeyword class
548
+ text: String,
549
+ score: Float,
550
+ algorithm: String,
551
+ positions: Array[Integer]?
552
+ }
553
+
554
+ type processing_warning_hash = { # record with the same two fields as the ProcessingWarning class
555
+ source: String,
556
+ message: String
557
+ }
558
+
559
+ type page_content_hash = { # NOTE(review): no hierarchy key and images is non-nil here, unlike Result::PageContent - confirm
560
+ page_number: Integer,
561
+ content: String,
562
+ tables: Array[table_hash],
563
+ images: Array[image_hash],
564
+ is_blank: bool?
565
+ }
566
+
567
+ type djot_content_hash = { # Djot document in Hash form; only `attributes` is nilable
568
+ plain_text: String,
569
+ blocks: Array[formatted_block_hash],
570
+ metadata_json: String,
571
+ tables: Array[table_hash],
572
+ images: Array[djot_image_hash],
573
+ links: Array[djot_link_hash],
574
+ footnotes: Array[footnote_hash],
575
+ attributes: Hash[String, attributes_hash]?
576
+ }
577
+
578
+ type formatted_block_hash = { # recursive record: `children` holds nested formatted_block_hash values; only block_type is non-nil
579
+ block_type: String,
580
+ level: Integer?,
581
+ content: String?,
582
+ children: Array[formatted_block_hash]?,
583
+ attributes: attributes_hash?
584
+ }
585
+
586
+ type djot_image_hash = { # image entry of djot_content_hash: url is required, alt/title/attributes are nilable
587
+ url: String,
588
+ alt: String?,
589
+ title: String?,
590
+ attributes: attributes_hash?
591
+ }
592
+
593
+ type djot_link_hash = { # link entry of djot_content_hash: url and text are required, title/link_type are nilable
594
+ url: String,
595
+ text: String,
596
+ title: String?,
597
+ link_type: String?
598
+ }
599
+
600
+ type footnote_hash = { # footnote entry of djot_content_hash: a label and its content, both Strings
601
+ label: String,
602
+ content: String
603
+ }
604
+
605
+ type attributes_hash = Hash[String, String | Integer | bool | Array[String] | nil] # String keys; values are String, Integer, bool, Array[String] or nil
606
+
607
+ type document_bounding_box_hash = { # record with the same four Float fields as the DocumentBoundingBox class
608
+ x0: Float,
609
+ y0: Float,
610
+ x1: Float,
611
+ y1: Float
612
+ }
613
+
614
+ type document_annotation_hash = { # NOTE(review): start/end/kind shape differs from the DocumentAnnotation class in this file (key/value) - confirm which is current
615
+ start: Integer,
616
+ end: Integer,
617
+ kind: {
618
+ annotation_type: String,
619
+ url: String?,
620
+ title: String?
621
+ }
622
+ }
623
+
624
+ type document_node_hash = { # Hash form of a document node. NOTE(review): content is an untyped Hash here but String on the DocumentNode class - confirm
625
+ id: String,
626
+ content: Hash[untyped, untyped],
627
+ parent: Integer?,
628
+ children: Array[Integer],
629
+ content_layer: String,
630
+ page: Integer?,
631
+ page_end: Integer?,
632
+ bbox: document_bounding_box_hash?,
633
+ annotations: Array[document_annotation_hash]
634
+ }
635
+
636
+ type document_structure_hash = { # Hash form of a document structure: a single `nodes` Array of document_node_hash
637
+ nodes: Array[document_node_hash]
638
+ }
639
+
640
+ type element_hash = { # Hash form of an element; metadata is a nilable String-keyed untyped Hash rather than an ElementMetadata
641
+ element_id: String,
642
+ element_type: String,
643
+ text: String,
644
+ metadata: Hash[String, untyped]?
645
+ }
646
+
647
+ type ocr_confidence_hash = { # separate nilable Float confidences for detection and recognition
648
+ detection: Float?,
649
+ recognition: Float?
650
+ }
651
+
652
+ type ocr_rotation_hash = { # nilable rotation angle (degrees, per the key name) and nilable confidence
653
+ angle_degrees: Float?,
654
+ confidence: Float?
655
+ }
656
+
657
+ type ocr_bounding_geometry_hash = { # `type` is required; left/top/width/height and points are all nilable (presumably rectangle vs. polygon variants selected by `type` - TODO confirm)
658
+ type: String,
659
+ left: Float?,
660
+ top: Float?,
661
+ width: Float?,
662
+ height: Float?,
663
+ points: Array[Array[Float]]?
664
+ }
665
+
666
+ type ocr_element_hash = { # OCR element in Hash form: only `text` is non-nil; geometry, confidence and rotation use the ocr_*_hash records
667
+ text: String,
668
+ geometry: ocr_bounding_geometry_hash?,
669
+ confidence: ocr_confidence_hash?,
670
+ level: String?,
671
+ rotation: ocr_rotation_hash?,
672
+ page_number: Integer?,
673
+ parent_id: String?,
674
+ backend_metadata: Hash[String, untyped]?
675
+ }
676
+
677
+ type table_hash = { # Hash form of a table (return type of Result::Table#to_h); bounding_box is an inline nilable record, not a named alias
678
+ cells: Array[Array[String]],
679
+ markdown: String,
680
+ page_number: Integer,
681
+ bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
682
+ }
683
+
684
+ type chunk_hash = { # Hash form of a text chunk (return type of Result::Chunk#to_h); token_count, page range and embedding are nilable
685
+ content: String,
686
+ byte_start: Integer,
687
+ byte_end: Integer,
688
+ token_count: Integer?,
689
+ chunk_index: Integer,
690
+ total_chunks: Integer,
691
+ first_page: Integer?,
692
+ last_page: Integer?,
693
+ embedding: Array[Float]?
694
+ }
695
+
696
+ type image_hash = { # Hash form of an extracted image (return type of Result::Image#to_h); mutually recursive with extraction_result_hash via ocr_result
697
+ data: String,
698
+ format: String,
699
+ image_index: Integer,
700
+ page_number: Integer?,
701
+ width: Integer?,
702
+ height: Integer?,
703
+ colorspace: String?,
704
+ bits_per_component: Integer?,
705
+ is_mask: bool,
706
+ description: String?,
707
+ bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?,
708
+ ocr_result: extraction_result_hash?
709
+ }
710
+
711
+ # Metadata hash type - represents the parsed JSON metadata from extraction results.
712
+ # Fields correspond to the Rust Metadata struct (serialized as JSON and parsed back).
713
+ #
714
+ # Common fields:
715
+ # "title" => String? - Document title
716
+ # "subject" => String? - Document subject or description
717
+ # "authors" => Array[String]? - Primary author(s)
718
+ # "keywords" => Array[String]? - Keywords/tags
719
+ # "language" => String? - Primary language (ISO 639 code)
720
+ # "created_at" => String? - Creation timestamp (ISO 8601)
721
+ # "modified_at" => String? - Last modification timestamp (ISO 8601)
722
+ # "created_by" => String? - User who created the document
723
+ # "modified_by" => String? - User who last modified the document
724
+ # "pages" => Hash? - Page/slide/sheet structure with boundaries
725
+ # "format_type" => String? - Format discriminator (pdf, docx, excel, etc.)
726
+ # "image_preprocessing" => Hash? - Image preprocessing metadata
727
+ # "json_schema" => untyped? - JSON schema for structured data extraction
728
+ # "error" => Hash? - Error metadata (for batch operations)
729
+ # "extraction_duration_ms" => Integer? - Extraction duration in milliseconds
730
+ # "category" => String? - Document category
731
+ # "tags" => Array[String]? - Document tags
732
+ # "document_version" => String? - Document version string
733
+ # "abstract_text" => String? - Abstract or summary text
734
+ # "output_format" => String? - Output format identifier
735
+ type metadata_hash = Hash[String, untyped] # values are untyped: the field list in the comment above is documentation only, not enforced by this type
736
+
737
+ type config_hash = Hash[Symbol, untyped] # Symbol-keyed configuration Hash
738
+ type config_input = config_hash | _ToH # either a config Hash or any object responding to to_h
739
+
740
+ interface _ToH # structural interface: anything whose to_h returns a config_hash
741
+ def to_h: () -> config_hash
742
+ end
743
+
744
+ # Extraction result wrapper
745
+ class Result
746
+ # Table structure (Struct from result.rb): cell grid, markdown rendering, page number and a nilable BoundingBox; to_h returns a table_hash
747
+ class Table
748
+ attr_reader cells: Array[Array[String]]
749
+ attr_reader markdown: String
750
+ attr_reader page_number: Integer
751
+ attr_reader bounding_box: BoundingBox?
752
+
753
+ def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer, bounding_box: BoundingBox?) -> void
754
+ def to_h: () -> table_hash
755
+ end
756
+
757
+ # Text chunk (Struct from result.rb): content with byte_start/byte_end, chunk_index/total_chunks and nilable token_count, page range and embedding; all keywords required; to_h returns a chunk_hash
758
+ class Chunk
759
+ attr_reader content: String
760
+ attr_reader byte_start: Integer
761
+ attr_reader byte_end: Integer
762
+ attr_reader token_count: Integer?
763
+ attr_reader chunk_index: Integer
764
+ attr_reader total_chunks: Integer
765
+ attr_reader first_page: Integer?
766
+ attr_reader last_page: Integer?
767
+ attr_reader embedding: Array[Float]?
768
+
769
+ def initialize: (
770
+ content: String,
771
+ byte_start: Integer,
772
+ byte_end: Integer,
773
+ token_count: Integer?,
774
+ chunk_index: Integer,
775
+ total_chunks: Integer,
776
+ first_page: Integer?,
777
+ last_page: Integer?,
778
+ embedding: Array[Float]?
779
+ ) -> void
780
+ def to_h: () -> chunk_hash
781
+ end
782
+
783
+ # Extracted image (Struct from result.rb). ocr_result is itself a nilable Result, so the type is recursive; all keywords required; to_h returns an image_hash
784
+ class Image
785
+ attr_reader data: String
786
+ attr_reader format: String
787
+ attr_reader image_index: Integer
788
+ attr_reader page_number: Integer?
789
+ attr_reader width: Integer?
790
+ attr_reader height: Integer?
791
+ attr_reader colorspace: String?
792
+ attr_reader bits_per_component: Integer?
793
+ attr_reader is_mask: bool
794
+ attr_reader description: String?
795
+ attr_reader bounding_box: BoundingBox?
796
+ attr_reader ocr_result: Result?
797
+
798
+ def initialize: (
799
+ data: String,
800
+ format: String,
801
+ image_index: Integer,
802
+ page_number: Integer?,
803
+ width: Integer?,
804
+ height: Integer?,
805
+ colorspace: String?,
806
+ bits_per_component: Integer?,
807
+ is_mask: bool,
808
+ description: String?,
809
+ bounding_box: BoundingBox?,
810
+ ocr_result: Result?
811
+ ) -> void
812
+ def to_h: () -> image_hash
813
+ end
814
+
815
+ # Page content (Struct from result.rb): page text plus the tables, images and hierarchy attached to that page
816
+ class PageContent
817
+ attr_reader page_number: Integer
818
+ attr_reader content: String
819
+ attr_reader tables: Array[Table] # Never nil, unlike images
820
+ attr_reader images: Array[Image]? # May be nil
821
+ attr_reader hierarchy: PageHierarchy? # May be nil
822
+ attr_reader is_blank: bool? # May be nil
823
+
824
+ def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?, is_blank: bool?) -> void # All keywords are required; the nilable ones accept nil
825
+ def to_h: () -> Hash[Symbol, untyped]
826
+ end
827
+
828
+ # Hierarchical block element for page hierarchy (Struct from result.rb): text with optional font size, level and bbox
829
+ class HierarchicalBlock
830
+ attr_reader text: String
831
+ attr_reader font_size: Float? # May be nil
832
+ attr_reader level: String? # May be nil; the set of level names is not declared in this signature
833
+ attr_reader bbox: Array[Float]? # May be nil; NOTE(review): element order/count is not visible here, confirm against result.rb
834
+
835
+ def initialize: (text: String, font_size: Float?, level: String?, bbox: Array[Float]?) -> void # All keywords are required; the nilable ones accept nil
836
+ def to_h: () -> Hash[Symbol, untyped]
837
+ end
838
+
839
+ # Page hierarchy information (Struct from result.rb): a block count and the list of HierarchicalBlock entries
840
+ class PageHierarchy
841
+ attr_reader block_count: Integer # Stored separately from blocks; presumably equals blocks.size (confirm)
842
+ attr_reader blocks: Array[HierarchicalBlock]
843
+
844
+ def initialize: (block_count: Integer, blocks: Array[HierarchicalBlock]) -> void # Both keywords are required
845
+ def to_h: () -> Hash[Symbol, untyped]
846
+ end
847
+
848
+ # Element bounding box coordinates (Struct from result.rb): four required Float corners x0, y0, x1, y1
849
+ class ElementBoundingBox
850
+ attr_reader x0: Float
851
+ attr_reader y0: Float
852
+ attr_reader x1: Float
853
+ attr_reader y1: Float
854
+
855
+ def initialize: (x0: Float, y0: Float, x1: Float, y1: Float) -> void # All four keywords are required and non-nil
856
+ def to_h: () -> Hash[Symbol, untyped]
857
+ end
858
+
859
+ # Element metadata (Struct from result.rb): optional page, filename, coordinates and index, plus extra string pairs
860
+ class ElementMetadataStruct
861
+ attr_reader page_number: Integer? # May be nil
862
+ attr_reader filename: String? # May be nil
863
+ attr_reader coordinates: ElementBoundingBox? # May be nil
864
+ attr_reader element_index: Integer? # May be nil
865
+ attr_reader additional: Hash[String, String] # Never nil; String keys and String values
866
+
867
+ def initialize: (page_number: Integer?, filename: String?, coordinates: ElementBoundingBox?, element_index: Integer?, additional: Hash[String, String]) -> void # All keywords are required; the nilable ones accept nil
868
+ def to_h: () -> Hash[Symbol, untyped]
869
+ end
870
+
871
+ # Structured document element (Struct from result.rb): id, type name, text and an ElementMetadataStruct
872
+ class ElementStruct
873
+ attr_reader element_id: String
874
+ attr_reader element_type: String # Typed as a plain String; valid type names are not declared here
875
+ attr_reader text: String
876
+ attr_reader metadata: ElementMetadataStruct # Never nil
877
+
878
+ def initialize: (element_id: String, element_type: String, text: String, metadata: ElementMetadataStruct) -> void # All four keywords are required
879
+ def to_h: () -> Hash[Symbol, untyped]
880
+ end
881
+
882
+ # OCR bounding geometry (class from result.rb): a required type tag plus optional box fields or point list
883
+ class OcrBoundingGeometry
884
+ attr_reader type: String # NOTE(review): presumably selects between the box fields and points; confirm against result.rb
885
+ attr_reader left: Float? # May be nil
886
+ attr_reader top: Float? # May be nil
887
+ attr_reader width: Float? # May be nil
888
+ attr_reader height: Float? # May be nil
889
+ attr_reader points: Array[Array[Float]]? # May be nil
890
+ def initialize: (type: String, ?left: Float?, ?top: Float?, ?width: Float?, ?height: Float?, ?points: Array[Array[Float]]?) -> void # Only type: is required; the other keywords may be omitted
891
+ def to_h: () -> ocr_bounding_geometry_hash
892
+ end
893
+
894
+ # OCR confidence scores (class from result.rb): optional detection and recognition values
895
+ class OcrConfidence
896
+ attr_reader detection: Float? # May be nil; the value range is not declared in this signature
897
+ attr_reader recognition: Float? # May be nil; the value range is not declared in this signature
898
+ def initialize: (?detection: Float?, ?recognition: Float?) -> void # Both keywords may be omitted
899
+ def to_h: () -> ocr_confidence_hash
900
+ end
901
+
902
+ # OCR rotation information (class from result.rb): optional angle in degrees and optional confidence
903
+ class OcrRotation
904
+ attr_reader angle_degrees: Float? # May be nil
905
+ attr_reader confidence: Float? # May be nil
906
+ def initialize: (?angle_degrees: Float?, ?confidence: Float?) -> void # Both keywords may be omitted
907
+ def to_h: () -> ocr_rotation_hash
908
+ end
909
+
910
+ # OCR text element (class from result.rb): required text plus optional geometry, confidence, rotation and metadata
911
+ class OcrElement
912
+ attr_reader text: String
913
+ attr_reader geometry: OcrBoundingGeometry? # May be nil
914
+ attr_reader confidence: OcrConfidence? # May be nil
915
+ attr_reader level: String? # May be nil; valid level names are not declared here
916
+ attr_reader rotation: OcrRotation? # May be nil
917
+ attr_reader page_number: Integer? # May be nil
918
+ attr_reader parent_id: String? # May be nil
919
+ attr_reader backend_metadata: Hash[String, untyped]? # May be nil; values are untyped
920
+ def initialize: (text: String, ?geometry: OcrBoundingGeometry?, ?confidence: OcrConfidence?, ?level: String?, ?rotation: OcrRotation?, ?page_number: Integer?, ?parent_id: String?, ?backend_metadata: Hash[String, untyped]?) -> void # Only text: is required; the other keywords may be omitted
921
+ def to_h: () -> ocr_element_hash
922
+ end
923
+
924
+ # Structured Djot document representation (class from djot_content.rb): plain text plus parsed blocks, images, links and footnotes
925
+ class DjotContent
926
+ attr_reader plain_text: String
927
+ attr_reader blocks: Array[DjotContent::FormattedBlock]
928
+ attr_reader metadata_json: String # Metadata kept as a JSON string; #metadata returns a Hash
929
+ attr_reader tables: Array[untyped] # Element type is left untyped, unlike the other collections
930
+ attr_reader images: Array[DjotContent::DjotImage]
931
+ attr_reader links: Array[DjotContent::DjotLink]
932
+ attr_reader footnotes: Array[DjotContent::Footnote]
933
+ attr_reader attributes: Hash[String, untyped]? # May be nil
934
+
935
+ def initialize: (untyped hash) -> void # Takes a single positional, untyped argument
936
+ def to_h: () -> Hash[Symbol, untyped]
937
+ def metadata: () -> Hash[untyped, untyped] # Presumably the parsed form of metadata_json; confirm against djot_content.rb
938
+
939
+ private
940
+
941
+ def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
942
+ def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
943
+ def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
944
+ def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
945
+ def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
946
+
947
+ class FormattedBlock # Block node; children is typed as a nilable array of FormattedBlock
948
+ attr_reader block_type: String
949
+ attr_reader level: Integer? # May be nil
950
+ attr_reader content: String? # May be nil
951
+ attr_reader children: Array[FormattedBlock]? # May be nil
952
+ attr_reader attributes: Hash[String, untyped]? # May be nil
953
+
954
+ def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void # Accepts an optional positional argument and/or optional keywords, all untyped
955
+ def to_h: () -> Hash[Symbol, untyped]
956
+ end
957
+
958
+ class DjotImage # Image reference; exposes both #url and #src as String
959
+ attr_reader url: String
960
+ attr_reader alt: String? # May be nil
961
+ attr_reader title: String? # May be nil
962
+ attr_reader width: Integer? # May be nil
963
+ attr_reader height: Integer? # May be nil
964
+
965
+ def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void # Accepts both url: and src: keywords, all untyped
966
+ def src: () -> String # Presumably an alias of url; confirm against djot_content.rb
967
+ def to_h: () -> Hash[Symbol, untyped]
968
+ end
969
+
970
+ class DjotLink # Link reference; exposes both #url and #href as String
971
+ attr_reader url: String
972
+ attr_reader text: String? # May be nil
973
+ attr_reader title: String? # May be nil
974
+ attr_reader link_type: String? # May be nil
975
+
976
+ def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void # Accepts both url: and href: keywords, all untyped
977
+ def href: () -> String # Presumably an alias of url; confirm against djot_content.rb
978
+ def to_h: () -> Hash[Symbol, untyped]
979
+ end
980
+
981
+ class Footnote # Footnote with a required label and content
982
+ attr_reader label: String
983
+ attr_reader content: String
984
+
985
+ def initialize: (label: String, content: String) -> void # Both keywords are required
986
+ def to_h: () -> footnote_hash
987
+ end
988
+ end
989
+
990
+ # Structured document representation (class from document_structure.rb): holds the list of DocumentNode entries
991
+ # Note: This is Result::DocumentStructure, distinct from Kreuzberg::DocumentStructure (T::Struct)
992
+ class DocumentStructure
993
+ attr_reader nodes: Array[DocumentNode]
994
+
995
+ def initialize: (Hash[String | Symbol, untyped] hash) -> void # Takes one positional Hash with String or Symbol keys
996
+ def to_h: () -> Hash[Symbol, untyped]
997
+
998
+ private
999
+
1000
+ def parse_nodes: (Array[untyped]? nodes_data) -> Array[DocumentNode] # Accepts nil input but always returns an array
1001
+ end
1002
+
1003
+ # Single node in the document structure tree (class from document_structure.rb): parent/children are Integer references
1004
+ # Note: This is Result::DocumentNode, distinct from Kreuzberg::DocumentNode (T::Struct)
1005
+ class DocumentNode
1006
+ attr_reader id: String
1007
+ attr_reader content: Hash[untyped, untyped] # Key and value types are left untyped
1008
+ attr_reader parent: Integer? # May be nil; presumably an index into DocumentStructure#nodes (confirm)
1009
+ attr_reader children: Array[Integer] # Integer references; see extract_child_index below
1010
+ attr_reader content_layer: String
1011
+ attr_reader page: Integer? # May be nil
1012
+ attr_reader page_end: Integer? # May be nil
1013
+ attr_reader bbox: DocumentBoundingBox? # May be nil
1014
+ attr_reader annotations: Array[DocumentAnnotation] # Never nil
1015
+
1016
+ def initialize: (Hash[String | Symbol, untyped] hash) -> void # Takes one positional Hash with String or Symbol keys
1017
+ def to_h: () -> Hash[Symbol, untyped]
1018
+
1019
+ private
1020
+
1021
+ def assign_core_fields: (Hash[String | Symbol, untyped] hash) -> void
1022
+ def assign_tree_fields: (Hash[String | Symbol, untyped] hash) -> void
1023
+ def assign_metadata_fields: (Hash[String | Symbol, untyped] hash) -> void
1024
+ def parse_children: (Array[untyped]? children_data) -> Array[Integer] # Accepts nil input but always returns an array
1025
+ def extract_child_index: (untyped child) -> Integer
1026
+ def parse_bbox: (Hash[String | Symbol, untyped]? bbox_data) -> DocumentBoundingBox? # nil is accepted and may be returned
1027
+ def parse_annotations: (Array[untyped]? annotations_data) -> Array[DocumentAnnotation] # Accepts nil input but always returns an array
1028
+ end
1029
+
1030
+ # Bounding box for document node positioning (class from document_structure.rb): four nilable Float corners
1031
+ # Note: This is Result::DocumentBoundingBox, distinct from Kreuzberg::DocumentBoundingBox (T::Struct)
1032
+ class DocumentBoundingBox
1033
+ attr_reader x0: Float? # May be nil, unlike ElementBoundingBox#x0
1034
+ attr_reader y0: Float? # May be nil
1035
+ attr_reader x1: Float? # May be nil
1036
+ attr_reader y1: Float? # May be nil
1037
+
1038
+ def initialize: (Hash[String | Symbol, untyped] hash) -> void # Takes one positional Hash with String or Symbol keys
1039
+ def to_h: () -> Hash[Symbol, untyped]
1040
+
1041
+ private
1042
+
1043
+ def extract_float: (Hash[String | Symbol, untyped] hash, String key) -> Float? # Returns nil or a Float for the given String key
1044
+ end
1045
+
1046
+ # Annotation for a document node (class from document_structure.rb): a start/end_offset range, a type and optional link fields
1047
+ # Note: This is Result::DocumentAnnotation, distinct from Kreuzberg::DocumentAnnotation (T::Struct)
1048
+ class DocumentAnnotation
1049
+ attr_reader start: Integer
1050
+ attr_reader end_offset: Integer # NOTE(review): offset unit (bytes vs. characters) is not visible here; confirm against document_structure.rb
1051
+ attr_reader annotation_type: String
1052
+ attr_reader url: String? # May be nil
1053
+ attr_reader title: String? # May be nil
1054
+
1055
+ def initialize: (Hash[String | Symbol, untyped] hash) -> void # Takes one positional Hash with String or Symbol keys
1056
+ def to_h: () -> Hash[Symbol, untyped]
1057
+
1058
+ private
1059
+
1060
+ def parse_kind: (Hash[String | Symbol, untyped]? kind_hash) -> void # Returns nothing useful; presumably assigns fields as a side effect (confirm)
1061
+ end
1062
+
1063
+ attr_reader content: String # Result's own readers start here; the nested classes above are the element types
1064
+ attr_reader mime_type: String
1065
+ attr_reader metadata: metadata_hash
1066
+ attr_reader metadata_json: String # Metadata is exposed both as metadata_hash and as a JSON string
1067
+ attr_reader tables: Array[Table] # Never nil, unlike the optional collections below
1068
+ attr_reader detected_languages: Array[String]? # May be nil
1069
+ attr_reader chunks: Array[Chunk]? # May be nil
1070
+ attr_reader images: Array[Image]? # May be nil
1071
+ attr_reader pages: Array[PageContent]? # May be nil
1072
+ attr_reader elements: Array[ElementStruct]? # May be nil
1073
+ attr_reader ocr_elements: Array[OcrElement]? # May be nil
1074
+ attr_reader djot_content: DjotContent? # May be nil
1075
+ attr_reader document: DocumentStructure? # May be nil
1076
+ attr_reader extracted_keywords: Array[ExtractedKeyword]? # May be nil
1077
+ attr_reader quality_score: Float? # May be nil
1078
+ attr_reader processing_warnings: Array[ProcessingWarning]? # May be nil
1079
+
1080
+ def initialize: (extraction_result_hash hash) -> void # Takes one positional extraction_result_hash
1081
+ def to_h: () -> Hash[Symbol, untyped]
1082
+ def to_json: (*untyped) -> String # Accepts and types any positional arguments as untyped
1083
+
1084
+ def page_count: () -> Integer
1085
+ def chunk_count: () -> Integer
1086
+ def detected_language: () -> String? # May return nil
1087
+ def metadata_field: (String | Symbol name) -> untyped # Field name may be a String or a Symbol
1088
+
1089
+ private
1090
+
1091
+ def parse_metadata: (String metadata_json) -> metadata_hash
1092
+ def parse_tables: (Array[table_hash]? tables_data) -> Array[Table] # Accepts nil input but always returns an array
1093
+ def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
1094
+ def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]?
1095
+ def parse_images: (Array[image_hash]? images_data) -> Array[Image]?
1096
+ def parse_pages: (Array[page_content_hash]? pages_data) -> Array[PageContent]?
1097
+ def parse_elements: (Array[untyped]? elements_data) -> Array[ElementStruct]?
1098
+ def parse_element: (Hash[String, untyped] element_hash) -> ElementStruct
1099
+ def parse_element_coordinates: (Hash[String, untyped]? coordinates_data) -> ElementBoundingBox?
1100
+ def parse_ocr_elements: (Array[ocr_element_hash]? ocr_elements_data) -> Array[OcrElement]?
1101
+ def parse_page_hierarchy: (Hash[String, untyped]? hierarchy_data) -> PageHierarchy?
1102
+ def parse_djot_content: (Hash[String, untyped]? djot_data) -> DjotContent?
1103
+ def parse_document_structure: (Hash[String, untyped]? document_data) -> DocumentStructure?
1104
+ def parse_extracted_keywords: (Array[extracted_keyword_hash]? keywords_data) -> Array[ExtractedKeyword]?
1105
+ def parse_processing_warnings: (Array[processing_warning_hash]? warnings_data) -> Array[ProcessingWarning] # NOTE(review): non-nil return while the processing_warnings reader is nilable; confirm against result.rb
1106
+ def get_value: (Hash[String | Symbol, untyped] hash, String key, ?untyped default) -> untyped # Third positional argument (default) is optional
1107
+ def serialize_tables: () -> Array[table_hash]
1108
+ def serialize_chunks: () -> Array[chunk_hash]?
1109
+ def serialize_images: () -> Array[image_hash]?
1110
+ def serialize_pages: () -> Array[Hash[Symbol, untyped]]?
1111
+ def serialize_elements: () -> Array[Hash[Symbol, untyped]]?
1112
+ def serialize_ocr_elements: () -> Array[ocr_element_hash]?
1113
+ def parse_ocr_geometry: (Hash[String, untyped]? data) -> OcrBoundingGeometry?
1114
+ def parse_ocr_confidence: (Hash[String, untyped]? data) -> OcrConfidence?
1115
+ def parse_ocr_rotation: (Hash[String, untyped]? data) -> OcrRotation?
1116
+ end
1117
+
1118
+ # Module methods (extraction API): each *_sync signature has a same-shaped counterpart without the suffix
1119
+ def self.extract_file_sync: (
1120
+ path: String | Pathname,
1121
+ ?mime_type: String?,
1122
+ ?config: config_input?
1123
+ ) -> Result # Only path: is required
1124
+
1125
+ def self.extract_bytes_sync: (
1126
+ data: String,
1127
+ mime_type: String,
1128
+ ?config: config_input?
1129
+ ) -> Result # data: and mime_type: are both required
1130
+
1131
+ def self.batch_extract_files_sync: (
1132
+ paths: Array[String | Pathname],
1133
+ ?config: config_input?
1134
+ ) -> Array[Result] # No per-file mime_type keyword is declared for the batch form
1135
+
1136
+ def self.batch_extract_bytes_sync: (
1137
+ data_array: Array[String],
1138
+ mime_types: Array[String],
1139
+ ?config: config_input?
1140
+ ) -> Array[Result] # Presumably data_array and mime_types are matched by index (confirm)
1141
+
1142
+ def self.extract_file: (
1143
+ path: String | Pathname,
1144
+ ?mime_type: String?,
1145
+ ?config: config_input?
1146
+ ) -> Result # Same signature as extract_file_sync
1147
+
1148
+ def self.extract_bytes: (
1149
+ data: String,
1150
+ mime_type: String,
1151
+ ?config: config_input?
1152
+ ) -> Result # Same signature as extract_bytes_sync
1153
+
1154
+ def self.batch_extract_files: (
1155
+ paths: Array[String | Pathname],
1156
+ ?config: config_input?
1157
+ ) -> Array[Result] # Same signature as batch_extract_files_sync
1158
+
1159
+ def self.batch_extract_bytes: (
1160
+ data_array: Array[String],
1161
+ mime_types: Array[String],
1162
+ ?config: config_input?
1163
+ ) -> Array[Result] # Same signature as batch_extract_bytes_sync
1164
+
1165
+ # Cache API: clear the cache and read its statistics
1166
+ def self.clear_cache: () -> void
1167
+ def self.cache_stats: () -> Hash[Symbol | String, Integer] # Keys may be Symbols or Strings; values are Integers
1168
+
1169
+ # Config loading (native methods): both return Symbol-keyed hashes
1170
+ def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
1171
+ def self._config_discover_native: () -> Hash[Symbol, untyped]? # May return nil (presumably when no config is found; confirm)
1172
+
1173
+ # Error introspection (native methods): same names as the ErrorContext module methods, with a _native suffix
1174
+ def self._last_error_code_native: () -> Integer
1175
+ def self._last_panic_context_json_native: () -> String? # May return nil
1176
+ def self._get_error_details_native: () -> Hash[String, untyped]
1177
+ def self._classify_error_native: (String message) -> Hash[String, untyped]
1178
+ def self._error_code_name_native: (Integer code) -> String
1179
+ def self._error_code_description_native: (Integer code) -> String
1180
+
1181
+ # MIME type detection: from in-memory data, from a path, plus extension lookup and validation
1182
+ def self.detect_mime_type: (String data) -> String
1183
+ def self.detect_mime_type_from_path: (String path) -> String # Typed as String only, unlike extract_file's String | Pathname
1184
+ def self.get_extensions_for_mime: (String mime_type) -> Array[String]
1185
+ def self.validate_mime_type: (String mime_type) -> String # Returns a String; presumably the validated MIME type (confirm)
1186
+
1187
+ # Validation native methods: each _validate_* returns an Integer (presumably a status code; confirm against the native extension)
1188
+ def self._validate_binarization_method_native: (String method) -> Integer
1189
+ def self._validate_ocr_backend_native: (String backend) -> Integer
1190
+ def self._validate_language_code_native: (String code) -> Integer
1191
+ def self._validate_token_reduction_level_native: (String level) -> Integer
1192
+ def self._validate_tesseract_psm_native: (Integer psm) -> Integer
1193
+ def self._validate_tesseract_oem_native: (Integer oem) -> Integer
1194
+ def self._validate_output_format_native: (String format) -> Integer
1195
+ def self._validate_confidence_native: (Float confidence) -> Integer
1196
+ def self._validate_dpi_native: (Integer dpi) -> Integer
1197
+ def self._validate_chunking_params_native: (Integer max_chars, Integer max_overlap) -> Integer
1198
+ def self._get_valid_binarization_methods_native: () -> String # Returns a single String, not an Array; NOTE(review): encoding of the list is not visible here
1199
+ def self._get_valid_language_codes_native: () -> String
1200
+ def self._get_valid_ocr_backends_native: () -> String
1201
+ def self._get_valid_token_reduction_levels_native: () -> String
1202
+
1203
+ # Config wrapper functions: configs are passed in and returned as JSON strings
1204
+ def self._config_to_json_native: (String config_json) -> String
1205
+ def self._config_get_field_native: (String config_json, String field_name) -> untyped
1206
+ def self._config_merge_native: (String base_json, String override_json) -> String # Presumably override_json takes precedence over base_json (confirm)
1207
+
1208
+ # Result wrapper functions: return types match Result#page_count, #chunk_count, #detected_language and #metadata_field
1209
+ def self._result_page_count_native: (untyped result) -> Integer
1210
+ def self._result_chunk_count_native: (untyped result) -> Integer
1211
+ def self._result_detected_language_native: (untyped result) -> String? # May return nil
1212
+ def self._result_metadata_field_native: (untyped result, String field_name) -> untyped # field_name is String only here, unlike Result#metadata_field
1213
+
1214
+ # Plugin registration: register/unregister/list/clear for post-processors, validators, OCR backends and document extractors
1215
+ def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void # stage: is optional and may be nil
1216
+ def self.unregister_post_processor: (String name) -> void
1217
+ def self.clear_post_processors: () -> void
1218
+ def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void # priority: is optional and may be nil
1219
+ def self.unregister_validator: (String name) -> void
1220
+ def self.clear_validators: () -> void
1221
+ def self.register_ocr_backend: (String name, _OcrBackend backend) -> void
1222
+ def self.unregister_ocr_backend: (String name) -> void
1223
+ def self.list_ocr_backends: () -> Array[String]
1224
+ def self.clear_ocr_backends: () -> void
1225
+ def self.unregister_document_extractor: (String name) -> void # NOTE(review): no register_document_extractor signature is declared in this section
1226
+ def self.list_document_extractors: () -> Array[String]
1227
+ def self.clear_document_extractors: () -> void
1228
+ def self.list_post_processors: () -> Array[String]
1229
+ def self.list_validators: () -> Array[String]
1230
+
1231
+ interface _PostProcessor # Structural type accepted by register_post_processor
1232
+ def call: (extraction_result_hash result) -> extraction_result_hash # Receives and returns an extraction_result_hash
1233
+ end
1234
+
1235
+ interface _Validator # Structural type accepted by register_validator
1236
+ def call: (extraction_result_hash result) -> void # Return value is unused; presumably failure is signalled by raising (confirm)
1237
+ end
1238
+
1239
+ interface _OcrBackend # Structural type accepted by register_ocr_backend
1240
+ def name: () -> String
1241
+ def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String # Takes image bytes and a Symbol-keyed config hash; returns a String
1242
+ end
1243
+
1244
+ module ErrorContext # Module-level error introspection helpers; names parallel the _*_native module methods
1245
+ def self.last_error_code: () -> Integer
1246
+ def self.last_panic_context: () -> Errors::PanicContext? # May return nil
1247
+ def self.last_panic_context_json: () -> String? # May return nil
1248
+ def self.error_details: () -> Hash[String, untyped]
1249
+ def self.classify_error: (String message) -> (Hash[String, untyped] | Integer) # NOTE(review): wider than _classify_error_native, which returns only a Hash; confirm when an Integer is returned
1250
+ def self.error_code_name: (Integer code) -> String
1251
+ def self.error_code_description: (Integer code) -> String
1252
+ end
1253
+
1254
+ module Errors # Error hierarchy rooted at Errors::Error, plus the PanicContext value class
1255
+ # Panic context information from FFI error introspection: source location, message and timestamp
1256
+ class PanicContext
1257
+ attr_reader file: String
1258
+ attr_reader line: Integer
1259
+ attr_reader function: String
1260
+ attr_reader message: String
1261
+ attr_reader timestamp_secs: Integer # Timestamp in seconds; NOTE(review): the epoch is not visible in this signature
1262
+
1263
+ def initialize: (
1264
+ file: String,
1265
+ line: Integer,
1266
+ function: String,
1267
+ message: String,
1268
+ timestamp_secs: Integer
1269
+ ) -> void # All five keywords are required
1270
+ def to_s: () -> String
1271
+ def to_h: () -> Hash[Symbol, String | Integer]
1272
+ def self.from_json: (String) -> PanicContext? # May return nil
1273
+
1274
+ private
1275
+
1276
+ def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer} # NOTE(review): singleton method after a bare `private`; confirm the visibility member applies to `def self.` signatures
1277
+ end
1278
+
1279
+ class Error < StandardError # Base class for every error declared in this module
1280
+ attr_reader panic_context: PanicContext? # May be nil
1281
+ attr_reader error_code: Integer? # May be nil
1282
+
1283
+ def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void # message is required; both keywords are optional
1284
+ end
1285
+
1286
+ class ValidationError < Error # Adds no members beyond Error
1287
+ end
1288
+
1289
+ class ParsingError < Error # Adds an optional context hash
1290
+ attr_reader context: Hash[untyped, untyped]? # May be nil
1291
+
1292
+ def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
1293
+ end
1294
+
1295
+ class OCRError < Error # Adds an optional context hash
1296
+ attr_reader context: Hash[untyped, untyped]? # May be nil
1297
+
1298
+ def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
1299
+ end
1300
+
1301
+ class MissingDependencyError < Error # Adds an optional dependency name
1302
+ attr_reader dependency: String? # May be nil
1303
+
1304
+ def initialize: (String message, ?dependency: String?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
1305
+ end
1306
+
1307
+ class IOError < Error # Errors::IOError, a subclass of Errors::Error (not Ruby's top-level ::IOError)
1308
+ end
1309
+
1310
+ class PluginError < Error # Adds no members beyond Error
1311
+ end
1312
+
1313
+ class UnsupportedFormatError < Error # Adds no members beyond Error
1314
+ end
1315
+ end
1316
+
1317
+ # Internal modules (prepended to Kreuzberg singleton)
1317
+ # These are not checked by steep - see Steepfile
1318
+ module CacheAPI : Object # Declared with an Object self-type and no members
1319
+ end
1321
+
1322
+ module ExtractionAPI : Object # Declared with an Object self-type and no members
1322
+ end
1324
+
1325
+ module PostProcessorProtocol # Module form of the post-processor contract; #call has the same signature as _PostProcessor#call
1325
+ def call: (extraction_result_hash result) -> extraction_result_hash
1326
+ end
1328
+
1329
+ module ValidatorProtocol # Module form of the validator contract; #call has the same signature as _Validator#call
1329
+ def call: (extraction_result_hash result) -> void
1330
+ end
1332
+
1333
+ module OcrBackendProtocol # Module form of the OCR backend contract; same two signatures as _OcrBackend
1333
+ def name: () -> String
1334
+ def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String
1335
+ end
1337
+ end