kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
data/sig/kreuzberg.rbs
ADDED
|
@@ -0,0 +1,1337 @@
|
|
|
1
|
+
# Type signatures for the Kreuzberg document intelligence framework
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
# Version constant, declared as a String.
VERSION: String

# Error code constants.
# Every code is declared as an Integer; the concrete values are not part of
# this signature file.
ERROR_CODE_SUCCESS: Integer
ERROR_CODE_GENERIC: Integer
ERROR_CODE_PANIC: Integer
ERROR_CODE_INVALID_ARGUMENT: Integer
ERROR_CODE_IO: Integer
ERROR_CODE_PARSING: Integer
ERROR_CODE_OCR: Integer
ERROR_CODE_MISSING_DEPENDENCY: Integer
|
|
15
|
+
|
|
16
|
+
# Semantic element type classification (T.type_alias).
# A union of string literals, one per recognised element kind.
type element_type = 'title'
                  | 'narrative_text'
                  | 'heading'
                  | 'list_item'
                  | 'table'
                  | 'image'
                  | 'page_break'
                  | 'code_block'
                  | 'block_quote'
                  | 'footer'
                  | 'header'
|
|
18
|
+
|
|
19
|
+
# Bounding box coordinates for element positioning (T::Struct from types.rb)
class BoundingBox
  # Coordinates, exposed read-only as Floats.
  attr_reader x0: Float
  attr_reader y0: Float
  attr_reader x1: Float
  attr_reader y1: Float

  # All four coordinates are required keyword arguments.
  def initialize: (
    x0: Float,
    y0: Float,
    x1: Float,
    y1: Float
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
28
|
+
|
|
29
|
+
# Metadata for a semantic element (T::Struct from types.rb)
class ElementMetadata
  # Nilable location fields.
  attr_reader page_number: Integer?
  attr_reader filename: String?
  attr_reader coordinates: BoundingBox?
  attr_reader element_index: Integer?

  # String-to-String map; never nil according to this signature.
  attr_reader additional: Hash[String, String]

  # Every keyword is required, although most accept nil.
  def initialize: (
    page_number: Integer?,
    filename: String?,
    coordinates: BoundingBox?,
    element_index: Integer?,
    additional: Hash[String, String]
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
39
|
+
|
|
40
|
+
# Semantic element extracted from document (T::Struct from types.rb)
class Element
  attr_reader element_id: String

  # Declared as a plain String rather than the `element_type` alias.
  attr_reader element_type: String
  attr_reader text: String
  attr_reader metadata: ElementMetadata

  # All keywords are required.
  def initialize: (
    element_id: String,
    element_type: String,
    text: String,
    metadata: ElementMetadata
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
49
|
+
|
|
50
|
+
# Header/Heading metadata (T::Struct from types.rb)
class HeaderMetadata
  attr_reader level: Integer
  attr_reader text: String

  # The only nilable field of this struct.
  attr_reader id: String?
  attr_reader depth: Integer
  attr_reader html_offset: Integer

  # All keywords are required; only `id` accepts nil.
  def initialize: (
    level: Integer,
    text: String,
    id: String?,
    depth: Integer,
    html_offset: Integer
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
60
|
+
|
|
61
|
+
# Link metadata (T::Struct from types.rb)
class LinkMetadata
  attr_reader href: String
  attr_reader text: String
  attr_reader title: String?
  attr_reader link_type: String

  # Collection fields; neither is nilable.
  attr_reader rel: Array[String]
  attr_reader attributes: Hash[String, String]

  # All keywords are required; only `title` accepts nil.
  def initialize: (
    href: String,
    text: String,
    title: String?,
    link_type: String,
    rel: Array[String],
    attributes: Hash[String, String]
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
72
|
+
|
|
73
|
+
# Image metadata (T::Struct from types.rb)
class ImageMetadata
  attr_reader src: String
  attr_reader alt: String?
  attr_reader title: String?

  # Nilable Integer array.
  # NOTE(review): presumably [width, height] - confirm against types.rb.
  attr_reader dimensions: Array[Integer]?
  attr_reader image_type: String
  attr_reader attributes: Hash[String, String]

  # All keywords are required; `alt`, `title` and `dimensions` accept nil.
  def initialize: (
    src: String,
    alt: String?,
    title: String?,
    dimensions: Array[Integer]?,
    image_type: String,
    attributes: Hash[String, String]
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
84
|
+
|
|
85
|
+
# Structured data metadata (T::Struct from types.rb)
class StructuredData
  attr_reader data_type: String

  # Carried as a String; this signature exposes no parsed form.
  attr_reader raw_json: String
  attr_reader schema_type: String?

  # All keywords are required; only `schema_type` accepts nil.
  def initialize: (
    data_type: String,
    raw_json: String,
    schema_type: String?
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
93
|
+
|
|
94
|
+
# HTML metadata (T::Struct from types.rb)
class HtmlMetadata
  # Scalar fields: all nilable Strings.
  attr_reader title: String?
  attr_reader description: String?
  attr_reader author: String?
  attr_reader copyright: String?

  # Non-nilable String array.
  attr_reader keywords: Array[String]

  # More nilable String fields.
  attr_reader canonical_url: String?
  attr_reader language: String?
  attr_reader text_direction: String?
  attr_reader mime_type: String?
  attr_reader charset: String?
  attr_reader generator: String?
  attr_reader viewport: String?
  attr_reader theme_color: String?
  attr_reader application_name: String?
  attr_reader robots: String?

  # String-to-String maps; none are nilable.
  attr_reader open_graph: Hash[String, String]
  attr_reader twitter_card: Hash[String, String]
  attr_reader meta_tags: Hash[String, String]

  # Collections of the metadata structs declared in this file.
  attr_reader headers: Array[HeaderMetadata]
  attr_reader links: Array[LinkMetadata]
  attr_reader images: Array[ImageMetadata]
  attr_reader structured_data: Array[StructuredData]

  # Every keyword is required; the `String?` ones accept nil.
  def initialize: (
    title: String?,
    description: String?,
    author: String?,
    copyright: String?,
    keywords: Array[String],
    canonical_url: String?,
    language: String?,
    text_direction: String?,
    mime_type: String?,
    charset: String?,
    generator: String?,
    viewport: String?,
    theme_color: String?,
    application_name: String?,
    robots: String?,
    open_graph: Hash[String, String],
    twitter_card: Hash[String, String],
    meta_tags: Hash[String, String],
    headers: Array[HeaderMetadata],
    links: Array[LinkMetadata],
    images: Array[ImageMetadata],
    structured_data: Array[StructuredData]
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
144
|
+
|
|
145
|
+
# Extracted keyword with relevance metadata (T::Struct from types.rb)
class ExtractedKeyword
  attr_reader text: String
  attr_reader score: Float
  attr_reader algorithm: String
  attr_reader positions: Array[Integer]?

  # `positions` is the only optional keyword; the other three are required.
  def initialize: (
    text: String,
    score: Float,
    algorithm: String,
    ?positions: Array[Integer]?
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
154
|
+
|
|
155
|
+
# Processing warning from a pipeline stage (T::Struct from types.rb)
class ProcessingWarning
  attr_reader source: String
  attr_reader message: String

  # Both keywords are required.
  def initialize: (
    source: String,
    message: String
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
162
|
+
|
|
163
|
+
# Bounding box for document node positioning (T::Struct from types.rb)
class DocumentBoundingBox
  # Coordinates, exposed read-only as Floats.
  attr_reader x0: Float
  attr_reader y0: Float
  attr_reader x1: Float
  attr_reader y1: Float

  # All four coordinates are required keyword arguments.
  def initialize: (
    x0: Float,
    y0: Float,
    x1: Float,
    y1: Float
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
172
|
+
|
|
173
|
+
# Annotation for a document node (T::Struct from types.rb)
#
# NOTE(review): the `document_annotation_hash` alias in this file describes
# annotations with `start`/`end`/`kind` keys, which does not match the
# key/value shape declared here - confirm against types.rb.
class DocumentAnnotation
  attr_reader key: String
  attr_reader value: String

  # Both keywords are required.
  def initialize: (
    key: String,
    value: String
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
180
|
+
|
|
181
|
+
# Single node in the document structure tree (T::Struct from types.rb)
class DocumentNode
  attr_reader id: String

  # NOTE(review): String here, but `document_node_hash` in this file declares
  # `content` as Hash[untyped, untyped] - confirm which is intended.
  attr_reader content: String

  # Tree links are Integers.
  # NOTE(review): presumably indices into the owning node list - confirm.
  attr_reader parent: Integer?
  attr_reader children: Array[Integer]

  attr_reader content_layer: String
  attr_reader page: Integer?
  attr_reader page_end: Integer?
  attr_reader bbox: DocumentBoundingBox?
  attr_reader annotations: Array[DocumentAnnotation]

  # Every keyword is required; the `?`-typed ones accept nil.
  def initialize: (
    id: String,
    content: String,
    parent: Integer?,
    children: Array[Integer],
    content_layer: String,
    page: Integer?,
    page_end: Integer?,
    bbox: DocumentBoundingBox?,
    annotations: Array[DocumentAnnotation]
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
205
|
+
|
|
206
|
+
# Structured document representation (T::Struct from types.rb)
class DocumentStructure
  # The nodes of the document, as a flat Array.
  attr_reader nodes: Array[DocumentNode]

  # `nodes` is a required keyword.
  def initialize: (
    nodes: Array[DocumentNode]
  ) -> void

  # Returns the struct as a Symbol-keyed Hash.
  def serialize: () -> Hash[Symbol, untyped]
end
|
|
212
|
+
|
|
213
|
+
# Config namespace (defined in lib/kreuzberg/config.rb)
|
|
214
|
+
module Config
|
|
215
|
+
# OCR settings: backend and language plus optional backend-specific configs.
class OCR
  attr_reader backend: String
  attr_reader language: String

  # Nested configs; each reader is nilable.
  attr_reader tesseract_config: Tesseract?
  attr_reader paddle_ocr_config: PaddleOcr?
  attr_reader element_config: OcrElementConfig?

  # Every keyword is optional. Each nested config accepts either an instance
  # of its class, a Symbol-keyed Hash, or nil.
  def initialize: (
    ?backend: String,
    ?language: String,
    ?tesseract_config: (Tesseract | Hash[Symbol, untyped])?,
    ?paddle_ocr_config: (PaddleOcr | Hash[Symbol, untyped])?,
    ?element_config: (OcrElementConfig | Hash[Symbol, untyped])?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
225
|
+
|
|
226
|
+
# Tesseract settings held as a free-form option Hash.
class Tesseract
  attr_reader options: Hash[Symbol, untyped]

  # Accepts arbitrary keyword arguments.
  def initialize: (
    **untyped options
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
232
|
+
|
|
233
|
+
# PaddleOCR settings; every field is nilable.
class PaddleOcr
  attr_reader language: String?
  attr_reader cache_dir: String?
  attr_reader use_angle_cls: bool?
  attr_reader enable_table_detection: bool?

  # Nilable Float settings.
  attr_reader det_db_thresh: Float?
  attr_reader det_db_box_thresh: Float?
  attr_reader det_db_unclip_ratio: Float?

  # Nilable Integer settings.
  attr_reader det_limit_side_len: Integer?
  attr_reader rec_batch_num: Integer?

  # Every keyword is optional and accepts nil.
  def initialize: (
    ?language: String?,
    ?cache_dir: String?,
    ?use_angle_cls: bool?,
    ?enable_table_detection: bool?,
    ?det_db_thresh: Float?,
    ?det_db_box_thresh: Float?,
    ?det_db_unclip_ratio: Float?,
    ?det_limit_side_len: Integer?,
    ?rec_batch_num: Integer?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
246
|
+
|
|
247
|
+
# Settings for OCR element output.
class OcrElementConfig
  attr_reader include_elements: bool
  attr_reader min_level: String?
  attr_reader min_confidence: Float?
  attr_reader build_hierarchy: bool

  # Every keyword is optional; the two boolean flags do not accept nil.
  def initialize: (
    ?include_elements: bool,
    ?min_level: String?,
    ?min_confidence: Float?,
    ?build_hierarchy: bool
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
255
|
+
|
|
256
|
+
# Chunking settings.
#
# NOTE(review): reader and initializer types disagree in places - `enabled`
# is read as `bool?` but accepted as `bool`, while `max_chars` and
# `max_overlap` are read as Integer but accepted as `Integer?`. Confirm
# against config.rb.
class Chunking
  attr_reader max_chars: Integer
  attr_reader max_overlap: Integer
  attr_reader preset: String?
  attr_reader embedding: Embedding?
  attr_reader enabled: bool?

  # Every keyword is optional. `chunk_size` and `chunk_overlap` have no
  # matching reader.
  # NOTE(review): presumably aliases for max_chars/max_overlap - confirm.
  def initialize: (
    ?max_chars: Integer?,
    ?max_overlap: Integer?,
    ?preset: String?,
    ?embedding: (Embedding | Hash[Symbol, untyped])?,
    ?chunk_size: Integer?,
    ?chunk_overlap: Integer?,
    ?enabled: bool
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
274
|
+
|
|
275
|
+
# Embedding settings.
class Embedding
  # The model is described by a Symbol-keyed Hash; its keys are not typed here.
  attr_reader model: Hash[Symbol, untyped]

  # Nilable tuning options.
  attr_reader normalize: bool?
  attr_reader batch_size: Integer?
  attr_reader show_download_progress: bool?
  attr_reader cache_dir: String?

  # Every keyword is optional; all but `model` accept nil.
  def initialize: (
    ?model: Hash[Symbol, untyped],
    ?normalize: bool?,
    ?batch_size: Integer?,
    ?show_download_progress: bool?,
    ?cache_dir: String?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
291
|
+
|
|
292
|
+
# Language detection settings.
class LanguageDetection
  attr_reader enabled: bool
  attr_reader min_confidence: Float
  attr_reader detect_multiple: bool

  # Every keyword is optional; none accept nil.
  def initialize: (
    ?enabled: bool,
    ?min_confidence: Float,
    ?detect_multiple: bool
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
300
|
+
|
|
301
|
+
# Font settings. Both fields are declared with `attr_accessor`, so they are
# writable as well as readable.
class FontConfig
  attr_accessor enabled: bool
  attr_accessor custom_font_dirs: Array[String]?

  # Both keywords are optional; `custom_font_dirs` accepts nil.
  def initialize: (
    ?enabled: bool,
    ?custom_font_dirs: Array[String]?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
308
|
+
|
|
309
|
+
# Hierarchy settings.
class Hierarchy
  attr_reader enabled: bool
  attr_reader k_clusters: Integer
  attr_reader include_bbox: bool
  attr_reader ocr_coverage_threshold: Float?

  # Every keyword is optional; only `ocr_coverage_threshold` accepts nil.
  def initialize: (
    ?enabled: bool,
    ?k_clusters: Integer,
    ?include_bbox: bool,
    ?ocr_coverage_threshold: Float?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]

  # Class-level constructor from a Symbol-keyed Hash. Both the argument and
  # the result are nilable.
  def self.from_h: (Hash[Symbol, untyped]?) -> Hierarchy?
end
|
|
319
|
+
|
|
320
|
+
# PDF extraction settings.
class PDF
  attr_reader extract_images: bool

  # Read back as an Array (or nil); the initializer also accepts a single String.
  attr_reader passwords: Array[String]?
  attr_reader extract_metadata: bool
  attr_reader font_config: FontConfig?
  attr_reader hierarchy: Hierarchy?

  # Every keyword is optional. Nested configs accept an instance, a
  # Symbol-keyed Hash, or nil.
  def initialize: (
    ?extract_images: bool,
    ?passwords: (Array[String] | String)?,
    ?extract_metadata: bool,
    ?font_config: (FontConfig | Hash[Symbol, untyped])?,
    ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
330
|
+
|
|
331
|
+
# Image extraction settings.
class ImageExtraction
  # Boolean switches.
  attr_reader extract_images: bool
  attr_reader auto_adjust_dpi: bool

  # Integer settings.
  attr_reader target_dpi: Integer
  attr_reader max_image_dimension: Integer
  attr_reader min_dpi: Integer
  attr_reader max_dpi: Integer

  # Every keyword is optional; none accept nil.
  def initialize: (
    ?extract_images: bool,
    ?target_dpi: Integer,
    ?max_image_dimension: Integer,
    ?auto_adjust_dpi: bool,
    ?min_dpi: Integer,
    ?max_dpi: Integer
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
349
|
+
|
|
350
|
+
# Image preprocessing settings.
class ImagePreprocessing
  attr_reader target_dpi: Integer

  # Boolean preprocessing switches.
  attr_reader auto_rotate: bool
  attr_reader deskew: bool
  attr_reader denoise: bool
  attr_reader contrast_enhance: bool
  attr_reader invert_colors: bool

  # Declared as a plain String; accepted values are not listed in this file.
  attr_reader binarization_method: String

  # Every keyword is optional; none accept nil.
  def initialize: (
    ?target_dpi: Integer,
    ?auto_rotate: bool,
    ?deskew: bool,
    ?denoise: bool,
    ?contrast_enhance: bool,
    ?binarization_method: String,
    ?invert_colors: bool
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
370
|
+
|
|
371
|
+
# Token reduction settings.
class TokenReduction
  attr_reader mode: String
  attr_reader preserve_important_words: bool

  # Both keywords are optional; neither accepts nil.
  def initialize: (
    ?mode: String,
    ?preserve_important_words: bool
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
378
|
+
|
|
379
|
+
# Post-processor settings.
class PostProcessor
  attr_reader enabled: bool

  # Nilable String arrays.
  # NOTE(review): presumably processor names - confirm.
  attr_reader enabled_processors: Array[String]?
  attr_reader disabled_processors: Array[String]?

  # Every keyword is optional; the two lists accept nil.
  def initialize: (
    ?enabled: bool,
    ?enabled_processors: Array[String]?,
    ?disabled_processors: Array[String]?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
387
|
+
|
|
388
|
+
# HTML preprocessing settings; every field is nilable.
class HtmlPreprocessing
  attr_reader enabled: bool?

  # Declared as a Symbol, unlike the String presets used elsewhere in this file.
  attr_reader preset: Symbol?
  attr_reader remove_navigation: bool?
  attr_reader remove_forms: bool?

  # Every keyword is optional and accepts nil.
  def initialize: (
    ?enabled: bool?,
    ?preset: Symbol?,
    ?remove_navigation: bool?,
    ?remove_forms: bool?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
397
|
+
|
|
398
|
+
# HTML options held as a free-form option Hash.
class HtmlOptions
  attr_reader options: Hash[Symbol, untyped]

  # Accepts arbitrary keyword arguments.
  def initialize: (
    **untyped options
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
404
|
+
|
|
405
|
+
# Parameters for the YAKE keyword algorithm.
class KeywordYakeParams
  attr_reader window_size: Integer

  # `window_size` is optional and does not accept nil.
  def initialize: (
    ?window_size: Integer
  ) -> void

  # Returns the parameters as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
411
|
+
|
|
412
|
+
# Parameters for the RAKE keyword algorithm.
class KeywordRakeParams
  attr_reader min_word_length: Integer
  attr_reader max_words_per_phrase: Integer

  # Both keywords are optional; neither accepts nil.
  def initialize: (
    ?min_word_length: Integer,
    ?max_words_per_phrase: Integer
  ) -> void

  # Returns the parameters as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
419
|
+
|
|
420
|
+
# Keyword extraction settings; every field is nilable.
class Keywords
  # Read back as Strings, although the initializer also accepts Symbols.
  # NOTE(review): presumably normalised to String on assignment - confirm.
  attr_reader algorithm: String?
  attr_reader language: String?

  attr_reader max_keywords: Integer?
  attr_reader min_score: Float?
  attr_reader ngram_range: Array[Integer]?

  # Algorithm-specific parameter objects.
  attr_reader yake_params: KeywordYakeParams?
  attr_reader rake_params: KeywordRakeParams?

  # Every keyword is optional and accepts nil. The parameter objects accept
  # an instance or a Symbol-keyed Hash.
  def initialize: (
    ?algorithm: (Symbol | String)?,
    ?max_keywords: Integer?,
    ?min_score: Float?,
    ?ngram_range: Array[Integer]?,
    ?language: (Symbol | String)?,
    ?yake_params: (KeywordYakeParams | Hash[Symbol, untyped])?,
    ?rake_params: (KeywordRakeParams | Hash[Symbol, untyped])?
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
440
|
+
|
|
441
|
+
# Page extraction settings.
class PageConfig
  attr_reader extract_pages: bool
  attr_reader insert_page_markers: bool
  attr_reader marker_format: String

  # Every keyword is optional; none accept nil.
  def initialize: (
    ?extract_pages: bool,
    ?insert_page_markers: bool,
    ?marker_format: String
  ) -> void

  # Returns the configuration as a Symbol-keyed Hash.
  def to_h: () -> Hash[Symbol, untyped]
end
|
|
449
|
+
|
|
450
|
+
# Top-level extraction configuration that aggregates the other Config classes.
class Extraction
  # Boolean switches.
  attr_reader use_cache: bool
  attr_reader enable_quality_processing: bool
  attr_reader force_ocr: bool
  attr_reader include_document_structure: bool

  # Nested configuration objects; each is nilable.
  attr_reader ocr: OCR?
  attr_reader chunking: Chunking?
  attr_reader language_detection: LanguageDetection?
  attr_reader pdf_options: PDF?
  attr_reader images: ImageExtraction?
  attr_reader postprocessor: PostProcessor?
  attr_reader token_reduction: TokenReduction?
  attr_reader keywords: Keywords?
  attr_reader html_options: HtmlOptions?
  attr_reader pages: PageConfig?

  # Nilable scalar options.
  attr_reader max_concurrent_extractions: Integer?
  attr_reader output_format: String?
  attr_reader result_format: String?

  # NOTE(review): has a reader but no matching initializer keyword below -
  # confirm how it is populated.
  attr_reader security_limits: Hash[String, Integer]?

  # `image_extraction` is a second name for the `images` reader; the
  # initializer keyword uses the `image_extraction` spelling.
  alias image_extraction images

  # Class-level constructors: `from_file` takes a path and always returns an
  # Extraction; `discover` takes no arguments and may return nil.
  def self.from_file: (String path) -> Extraction
  def self.discover: () -> Extraction?

  # Every keyword is optional. Nested configs accept an instance of their
  # class, a Symbol-keyed Hash, or nil.
  def initialize: (
    ?use_cache: bool,
    ?enable_quality_processing: bool,
    ?force_ocr: bool,
    ?include_document_structure: bool,
    ?ocr: (OCR | Hash[Symbol, untyped])?,
    ?chunking: (Chunking | Hash[Symbol, untyped])?,
    ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
    ?pdf_options: (PDF | Hash[Symbol, untyped])?,
    ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
    ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
    ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
    ?keywords: (Keywords | Hash[Symbol, untyped])?,
    ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
    ?pages: (PageConfig | Hash[Symbol, untyped])?,
    ?max_concurrent_extractions: Integer?,
    ?output_format: String?,
    ?result_format: String?
  ) -> void

  # Serialisation.
  def to_h: () -> Hash[Symbol, untyped]
  def to_json: (*untyped) -> String

  # Field access by String or Symbol name.
  def get_field: (String | Symbol field_name) -> untyped
  def []: (Symbol | String key) -> untyped
  def []=: (Symbol | String key, untyped value) -> untyped

  # Merging: `merge` returns an Extraction, `merge!` returns self.
  def merge: (Extraction | Hash[Symbol, untyped] other) -> Extraction
  def merge!: (Extraction | Hash[Symbol, untyped] other) -> self

  # Explicit writers for the two format options.
  def output_format=: (String? value) -> String?
  def result_format=: (String? value) -> String?

  private

  # Internal helpers.
  def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
  def extract_from_hash: (Hash[Symbol, untyped]? hash, Hash[Symbol, untyped] defaults) -> Hash[Symbol, untyped]
  def assign_attributes: (Hash[Symbol, untyped] params) -> void
  def validate_output_format: (untyped value) -> String?
  def validate_result_format: (untyped value) -> String?
  def update_from_merged: (Extraction merged) -> void
end
|
|
512
|
+
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
# Alias for Config::Extraction (for API consistency with other language bindings).
# Declared as a constant holding the class object, so it is typed as the
# singleton of Config::Extraction.
ExtractionConfig: singleton(Config::Extraction)
|
|
517
|
+
|
|
518
|
+
# Alias for Config::PageConfig (for API consistency with other language bindings).
# Declared as a constant holding the class object, so it is typed as the
# singleton of Config::PageConfig.
PageConfig: singleton(Config::PageConfig)
|
|
520
|
+
|
|
521
|
+
# Keyword algorithm constants.
# Both are declared as Symbols; the concrete values are not part of this file.
module KeywordAlgorithm
  YAKE: Symbol
  RAKE: Symbol
end
|
|
526
|
+
|
|
527
|
+
# Extraction result type.
# Record with four required-value keys (content, mime_type, metadata_json,
# metadata); every other key is nilable.
type extraction_result_hash = {
  content: String,
  mime_type: String,
  metadata_json: String,
  metadata: Hash[String, untyped],
  tables: Array[table_hash]?,
  detected_languages: Array[String]?,
  chunks: Array[chunk_hash]?,
  images: Array[image_hash]?,
  pages: Array[page_content_hash]?,
  elements: Array[element_hash]?,
  ocr_elements: Array[ocr_element_hash]?,
  djot_content: djot_content_hash?,
  document: document_structure_hash?,
  extracted_keywords: Array[extracted_keyword_hash]?,
  quality_score: Float?,
  processing_warnings: Array[processing_warning_hash]?
}
|
|
546
|
+
|
|
547
|
+
# Record form of an extracted keyword; same field names and types as the
# ExtractedKeyword class declared in this file.
type extracted_keyword_hash = {
  text: String,
  score: Float,
  algorithm: String,
  positions: Array[Integer]?
}
|
|
553
|
+
|
|
554
|
+
# Record form of a processing warning.
type processing_warning_hash = { source: String, message: String }
|
|
558
|
+
|
|
559
|
+
# Record for one page: its number, content, and the tables and images on it.
# Only `is_blank` is nilable.
type page_content_hash = {
  page_number: Integer,
  content: String,
  tables: Array[table_hash],
  images: Array[image_hash],
  is_blank: bool?
}
|
|
566
|
+
|
|
567
|
+
# Record for Djot content. Only `attributes` is nilable.
type djot_content_hash = {
  plain_text: String,
  blocks: Array[formatted_block_hash],
  metadata_json: String,
  tables: Array[table_hash],
  images: Array[djot_image_hash],
  links: Array[djot_link_hash],
  footnotes: Array[footnote_hash],
  attributes: Hash[String, attributes_hash]?
}
|
|
577
|
+
|
|
578
|
+
# Record for a formatted block. The type is recursive: `children` holds
# further formatted_block_hash records.
type formatted_block_hash = {
  block_type: String,
  level: Integer?,
  content: String?,
  children: Array[formatted_block_hash]?,
  attributes: attributes_hash?
}
|
|
585
|
+
|
|
586
|
+
# Record for an image in Djot content; only `url` is non-nilable.
type djot_image_hash = {
  url: String,
  alt: String?,
  title: String?,
  attributes: attributes_hash?
}
|
|
592
|
+
|
|
593
|
+
# Record for a link in Djot content; `title` and `link_type` are nilable.
type djot_link_hash = {
  url: String,
  text: String,
  title: String?,
  link_type: String?
}
|
|
599
|
+
|
|
600
|
+
# Record for a footnote: a label and its content.
type footnote_hash = { label: String, content: String }
|
|
604
|
+
|
|
605
|
+
# String-keyed attribute map whose values may be a String, an Integer, a
# boolean, an Array of Strings, or nil.
type attributes_hash = Hash[
  String,
  String | Integer | bool | Array[String] | nil
]
|
|
606
|
+
|
|
607
|
+
# Record form of a document bounding box; same four Float fields as the
# DocumentBoundingBox class declared in this file.
type document_bounding_box_hash = { x0: Float, y0: Float, x1: Float, y1: Float }
|
|
613
|
+
|
|
614
|
+
# Record for an annotation: an Integer `start`/`end` pair plus a nested
# `kind` record.
# NOTE(review): presumably start/end are offsets into the node content - confirm.
# NOTE(review): this shape differs from the key/value DocumentAnnotation
# class declared in this file - confirm which is current.
type document_annotation_hash = {
  start: Integer,
  end: Integer,
  kind: { annotation_type: String, url: String?, title: String? }
}
|
|
623
|
+
|
|
624
|
+
# Record form of a document node.
# NOTE(review): `content` is an untyped Hash here but a String on the
# DocumentNode class declared in this file - confirm which is intended.
type document_node_hash = {
  id: String,
  content: Hash[untyped, untyped],
  parent: Integer?,
  children: Array[Integer],
  content_layer: String,
  page: Integer?,
  page_end: Integer?,
  bbox: document_bounding_box_hash?,
  annotations: Array[document_annotation_hash]
}
|
|
635
|
+
|
|
636
|
+
# Record form of the document structure: a single `nodes` array.
type document_structure_hash = { nodes: Array[document_node_hash] }
|
|
639
|
+
|
|
640
|
+
# Record form of a semantic element. Unlike the Element class declared in
# this file, `metadata` is a nilable String-keyed Hash.
type element_hash = {
  element_id: String,
  element_type: String,
  text: String,
  metadata: Hash[String, untyped]?
}
|
|
646
|
+
|
|
647
|
+
# Record of OCR confidence values; both are nilable Floats.
type ocr_confidence_hash = { detection: Float?, recognition: Float? }
|
|
651
|
+
|
|
652
|
+
# Record of OCR rotation info; both fields are nilable Floats.
type ocr_rotation_hash = { angle_degrees: Float?, confidence: Float? }
|
|
656
|
+
|
|
657
|
+
# Record of OCR bounding geometry. `type` is the only non-nilable key.
# NOTE(review): presumably `type` selects between the left/top/width/height
# fields and the `points` list - confirm against the binding.
type ocr_bounding_geometry_hash = {
  type: String,
  left: Float?,
  top: Float?,
  width: Float?,
  height: Float?,
  points: Array[Array[Float]]?
}
|
|
665
|
+
|
|
666
|
+
# Record for one OCR element. `text` is the only non-nilable key.
type ocr_element_hash = {
  text: String,
  geometry: ocr_bounding_geometry_hash?,
  confidence: ocr_confidence_hash?,
  level: String?,
  rotation: ocr_rotation_hash?,
  page_number: Integer?,
  parent_id: String?,
  backend_metadata: Hash[String, untyped]?
}
|
|
676
|
+
|
|
677
|
+
# Record for an extracted table: cells as an Array of String Arrays, a
# markdown String, a page number, and a nilable inline bounding box record.
type table_hash = {
  cells: Array[Array[String]],
  markdown: String,
  page_number: Integer,
  bounding_box: {
    x0: Float,
    y0: Float,
    x1: Float,
    y1: Float
  }?
}
|
|
683
|
+
|
|
684
|
+
# Record for one text chunk. `token_count`, `first_page`, `last_page` and
# `embedding` are nilable; the remaining keys are not.
type chunk_hash = {
  content: String,
  byte_start: Integer,
  byte_end: Integer,
  token_count: Integer?,
  chunk_index: Integer,
  total_chunks: Integer,
  first_page: Integer?,
  last_page: Integer?,
  embedding: Array[Float]?
}
|
|
695
|
+
|
|
696
|
+
type image_hash = {
|
|
697
|
+
data: String,
|
|
698
|
+
format: String,
|
|
699
|
+
image_index: Integer,
|
|
700
|
+
page_number: Integer?,
|
|
701
|
+
width: Integer?,
|
|
702
|
+
height: Integer?,
|
|
703
|
+
colorspace: String?,
|
|
704
|
+
bits_per_component: Integer?,
|
|
705
|
+
is_mask: bool,
|
|
706
|
+
description: String?,
|
|
707
|
+
bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?,
|
|
708
|
+
ocr_result: extraction_result_hash?
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
# Metadata hash type - represents the parsed JSON metadata from extraction results.
|
|
712
|
+
# Fields correspond to the Rust Metadata struct (serialized as JSON and parsed back).
|
|
713
|
+
#
|
|
714
|
+
# Common fields:
|
|
715
|
+
# "title" => String? - Document title
|
|
716
|
+
# "subject" => String? - Document subject or description
|
|
717
|
+
# "authors" => Array[String]? - Primary author(s)
|
|
718
|
+
# "keywords" => Array[String]? - Keywords/tags
|
|
719
|
+
# "language" => String? - Primary language (ISO 639 code)
|
|
720
|
+
# "created_at" => String? - Creation timestamp (ISO 8601)
|
|
721
|
+
# "modified_at" => String? - Last modification timestamp (ISO 8601)
|
|
722
|
+
# "created_by" => String? - User who created the document
|
|
723
|
+
# "modified_by" => String? - User who last modified the document
|
|
724
|
+
# "pages" => Hash? - Page/slide/sheet structure with boundaries
|
|
725
|
+
# "format_type" => String? - Format discriminator (pdf, docx, excel, etc.)
|
|
726
|
+
# "image_preprocessing" => Hash? - Image preprocessing metadata
|
|
727
|
+
# "json_schema" => untyped? - JSON schema for structured data extraction
|
|
728
|
+
# "error" => Hash? - Error metadata (for batch operations)
|
|
729
|
+
# "extraction_duration_ms" => Integer? - Extraction duration in milliseconds
|
|
730
|
+
# "category" => String? - Document category
|
|
731
|
+
# "tags" => Array[String]? - Document tags
|
|
732
|
+
# "document_version" => String? - Document version string
|
|
733
|
+
# "abstract_text" => String? - Abstract or summary text
|
|
734
|
+
# "output_format" => String? - Output format identifier
|
|
735
|
+
type metadata_hash = Hash[String, untyped]
|
|
736
|
+
|
|
737
|
+
type config_hash = Hash[Symbol, untyped]
|
|
738
|
+
type config_input = config_hash | _ToH
|
|
739
|
+
|
|
740
|
+
interface _ToH
|
|
741
|
+
def to_h: () -> config_hash
|
|
742
|
+
end
|
|
743
|
+
|
|
744
|
+
# Extraction result wrapper
|
|
745
|
+
class Result
|
|
746
|
+
# Table structure (Struct from result.rb)
|
|
747
|
+
class Table
|
|
748
|
+
attr_reader cells: Array[Array[String]]
|
|
749
|
+
attr_reader markdown: String
|
|
750
|
+
attr_reader page_number: Integer
|
|
751
|
+
attr_reader bounding_box: BoundingBox?
|
|
752
|
+
|
|
753
|
+
def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer, bounding_box: BoundingBox?) -> void
|
|
754
|
+
def to_h: () -> table_hash
|
|
755
|
+
end
|
|
756
|
+
|
|
757
|
+
# Text chunk (Struct from result.rb)
|
|
758
|
+
class Chunk
|
|
759
|
+
attr_reader content: String
|
|
760
|
+
attr_reader byte_start: Integer
|
|
761
|
+
attr_reader byte_end: Integer
|
|
762
|
+
attr_reader token_count: Integer?
|
|
763
|
+
attr_reader chunk_index: Integer
|
|
764
|
+
attr_reader total_chunks: Integer
|
|
765
|
+
attr_reader first_page: Integer?
|
|
766
|
+
attr_reader last_page: Integer?
|
|
767
|
+
attr_reader embedding: Array[Float]?
|
|
768
|
+
|
|
769
|
+
def initialize: (
|
|
770
|
+
content: String,
|
|
771
|
+
byte_start: Integer,
|
|
772
|
+
byte_end: Integer,
|
|
773
|
+
token_count: Integer?,
|
|
774
|
+
chunk_index: Integer,
|
|
775
|
+
total_chunks: Integer,
|
|
776
|
+
first_page: Integer?,
|
|
777
|
+
last_page: Integer?,
|
|
778
|
+
embedding: Array[Float]?
|
|
779
|
+
) -> void
|
|
780
|
+
def to_h: () -> chunk_hash
|
|
781
|
+
end
|
|
782
|
+
|
|
783
|
+
# Extracted image (Struct from result.rb)
|
|
784
|
+
class Image
|
|
785
|
+
attr_reader data: String
|
|
786
|
+
attr_reader format: String
|
|
787
|
+
attr_reader image_index: Integer
|
|
788
|
+
attr_reader page_number: Integer?
|
|
789
|
+
attr_reader width: Integer?
|
|
790
|
+
attr_reader height: Integer?
|
|
791
|
+
attr_reader colorspace: String?
|
|
792
|
+
attr_reader bits_per_component: Integer?
|
|
793
|
+
attr_reader is_mask: bool
|
|
794
|
+
attr_reader description: String?
|
|
795
|
+
attr_reader bounding_box: BoundingBox?
|
|
796
|
+
attr_reader ocr_result: Result?
|
|
797
|
+
|
|
798
|
+
def initialize: (
|
|
799
|
+
data: String,
|
|
800
|
+
format: String,
|
|
801
|
+
image_index: Integer,
|
|
802
|
+
page_number: Integer?,
|
|
803
|
+
width: Integer?,
|
|
804
|
+
height: Integer?,
|
|
805
|
+
colorspace: String?,
|
|
806
|
+
bits_per_component: Integer?,
|
|
807
|
+
is_mask: bool,
|
|
808
|
+
description: String?,
|
|
809
|
+
bounding_box: BoundingBox?,
|
|
810
|
+
ocr_result: Result?
|
|
811
|
+
) -> void
|
|
812
|
+
def to_h: () -> image_hash
|
|
813
|
+
end
|
|
814
|
+
|
|
815
|
+
# Page content with text and extracted elements (Struct from result.rb)
|
|
816
|
+
class PageContent
|
|
817
|
+
attr_reader page_number: Integer
|
|
818
|
+
attr_reader content: String
|
|
819
|
+
attr_reader tables: Array[Table]
|
|
820
|
+
attr_reader images: Array[Image]?
|
|
821
|
+
attr_reader hierarchy: PageHierarchy?
|
|
822
|
+
attr_reader is_blank: bool?
|
|
823
|
+
|
|
824
|
+
def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?, is_blank: bool?) -> void
|
|
825
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
826
|
+
end
|
|
827
|
+
|
|
828
|
+
# Hierarchical block element for page hierarchy (Struct from result.rb)
|
|
829
|
+
class HierarchicalBlock
|
|
830
|
+
attr_reader text: String
|
|
831
|
+
attr_reader font_size: Float?
|
|
832
|
+
attr_reader level: String?
|
|
833
|
+
attr_reader bbox: Array[Float]?
|
|
834
|
+
|
|
835
|
+
def initialize: (text: String, font_size: Float?, level: String?, bbox: Array[Float]?) -> void
|
|
836
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
837
|
+
end
|
|
838
|
+
|
|
839
|
+
# Page hierarchy information (Struct from result.rb)
|
|
840
|
+
class PageHierarchy
|
|
841
|
+
attr_reader block_count: Integer
|
|
842
|
+
attr_reader blocks: Array[HierarchicalBlock]
|
|
843
|
+
|
|
844
|
+
def initialize: (block_count: Integer, blocks: Array[HierarchicalBlock]) -> void
|
|
845
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
846
|
+
end
|
|
847
|
+
|
|
848
|
+
# Element bounding box coordinates (Struct from result.rb)
|
|
849
|
+
class ElementBoundingBox
|
|
850
|
+
attr_reader x0: Float
|
|
851
|
+
attr_reader y0: Float
|
|
852
|
+
attr_reader x1: Float
|
|
853
|
+
attr_reader y1: Float
|
|
854
|
+
|
|
855
|
+
def initialize: (x0: Float, y0: Float, x1: Float, y1: Float) -> void
|
|
856
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
857
|
+
end
|
|
858
|
+
|
|
859
|
+
# Element metadata (Struct from result.rb)
|
|
860
|
+
class ElementMetadataStruct
|
|
861
|
+
attr_reader page_number: Integer?
|
|
862
|
+
attr_reader filename: String?
|
|
863
|
+
attr_reader coordinates: ElementBoundingBox?
|
|
864
|
+
attr_reader element_index: Integer?
|
|
865
|
+
attr_reader additional: Hash[String, String]
|
|
866
|
+
|
|
867
|
+
def initialize: (page_number: Integer?, filename: String?, coordinates: ElementBoundingBox?, element_index: Integer?, additional: Hash[String, String]) -> void
|
|
868
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
869
|
+
end
|
|
870
|
+
|
|
871
|
+
# Structured document element (Struct from result.rb)
|
|
872
|
+
class ElementStruct
|
|
873
|
+
attr_reader element_id: String
|
|
874
|
+
attr_reader element_type: String
|
|
875
|
+
attr_reader text: String
|
|
876
|
+
attr_reader metadata: ElementMetadataStruct
|
|
877
|
+
|
|
878
|
+
def initialize: (element_id: String, element_type: String, text: String, metadata: ElementMetadataStruct) -> void
|
|
879
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
880
|
+
end
|
|
881
|
+
|
|
882
|
+
# OCR bounding geometry (class from result.rb)
|
|
883
|
+
class OcrBoundingGeometry
|
|
884
|
+
attr_reader type: String
|
|
885
|
+
attr_reader left: Float?
|
|
886
|
+
attr_reader top: Float?
|
|
887
|
+
attr_reader width: Float?
|
|
888
|
+
attr_reader height: Float?
|
|
889
|
+
attr_reader points: Array[Array[Float]]?
|
|
890
|
+
def initialize: (type: String, ?left: Float?, ?top: Float?, ?width: Float?, ?height: Float?, ?points: Array[Array[Float]]?) -> void
|
|
891
|
+
def to_h: () -> ocr_bounding_geometry_hash
|
|
892
|
+
end
|
|
893
|
+
|
|
894
|
+
# OCR confidence scores (class from result.rb)
|
|
895
|
+
class OcrConfidence
|
|
896
|
+
attr_reader detection: Float?
|
|
897
|
+
attr_reader recognition: Float?
|
|
898
|
+
def initialize: (?detection: Float?, ?recognition: Float?) -> void
|
|
899
|
+
def to_h: () -> ocr_confidence_hash
|
|
900
|
+
end
|
|
901
|
+
|
|
902
|
+
# OCR rotation information (class from result.rb)
|
|
903
|
+
class OcrRotation
|
|
904
|
+
attr_reader angle_degrees: Float?
|
|
905
|
+
attr_reader confidence: Float?
|
|
906
|
+
def initialize: (?angle_degrees: Float?, ?confidence: Float?) -> void
|
|
907
|
+
def to_h: () -> ocr_rotation_hash
|
|
908
|
+
end
|
|
909
|
+
|
|
910
|
+
# OCR text element with geometry and metadata (class from result.rb)
|
|
911
|
+
class OcrElement
|
|
912
|
+
attr_reader text: String
|
|
913
|
+
attr_reader geometry: OcrBoundingGeometry?
|
|
914
|
+
attr_reader confidence: OcrConfidence?
|
|
915
|
+
attr_reader level: String?
|
|
916
|
+
attr_reader rotation: OcrRotation?
|
|
917
|
+
attr_reader page_number: Integer?
|
|
918
|
+
attr_reader parent_id: String?
|
|
919
|
+
attr_reader backend_metadata: Hash[String, untyped]?
|
|
920
|
+
def initialize: (text: String, ?geometry: OcrBoundingGeometry?, ?confidence: OcrConfidence?, ?level: String?, ?rotation: OcrRotation?, ?page_number: Integer?, ?parent_id: String?, ?backend_metadata: Hash[String, untyped]?) -> void
|
|
921
|
+
def to_h: () -> ocr_element_hash
|
|
922
|
+
end
|
|
923
|
+
|
|
924
|
+
# Structured Djot document representation (class from djot_content.rb)
|
|
925
|
+
class DjotContent
|
|
926
|
+
attr_reader plain_text: String
|
|
927
|
+
attr_reader blocks: Array[DjotContent::FormattedBlock]
|
|
928
|
+
attr_reader metadata_json: String
|
|
929
|
+
attr_reader tables: Array[untyped]
|
|
930
|
+
attr_reader images: Array[DjotContent::DjotImage]
|
|
931
|
+
attr_reader links: Array[DjotContent::DjotLink]
|
|
932
|
+
attr_reader footnotes: Array[DjotContent::Footnote]
|
|
933
|
+
attr_reader attributes: Hash[String, untyped]?
|
|
934
|
+
|
|
935
|
+
def initialize: (untyped hash) -> void
|
|
936
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
937
|
+
def metadata: () -> Hash[untyped, untyped]
|
|
938
|
+
|
|
939
|
+
private
|
|
940
|
+
|
|
941
|
+
def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
|
|
942
|
+
def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
|
|
943
|
+
def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
|
|
944
|
+
def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
|
|
945
|
+
def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
|
|
946
|
+
|
|
947
|
+
class FormattedBlock
|
|
948
|
+
attr_reader block_type: String
|
|
949
|
+
attr_reader level: Integer?
|
|
950
|
+
attr_reader content: String?
|
|
951
|
+
attr_reader children: Array[FormattedBlock]?
|
|
952
|
+
attr_reader attributes: Hash[String, untyped]?
|
|
953
|
+
|
|
954
|
+
def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
|
|
955
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
956
|
+
end
|
|
957
|
+
|
|
958
|
+
class DjotImage
|
|
959
|
+
attr_reader url: String
|
|
960
|
+
attr_reader alt: String?
|
|
961
|
+
attr_reader title: String?
|
|
962
|
+
attr_reader width: Integer?
|
|
963
|
+
attr_reader height: Integer?
|
|
964
|
+
|
|
965
|
+
def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
|
|
966
|
+
def src: () -> String
|
|
967
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
968
|
+
end
|
|
969
|
+
|
|
970
|
+
class DjotLink
|
|
971
|
+
attr_reader url: String
|
|
972
|
+
attr_reader text: String?
|
|
973
|
+
attr_reader title: String?
|
|
974
|
+
attr_reader link_type: String?
|
|
975
|
+
|
|
976
|
+
def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
|
|
977
|
+
def href: () -> String
|
|
978
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
979
|
+
end
|
|
980
|
+
|
|
981
|
+
class Footnote
|
|
982
|
+
attr_reader label: String
|
|
983
|
+
attr_reader content: String
|
|
984
|
+
|
|
985
|
+
def initialize: (label: String, content: String) -> void
|
|
986
|
+
def to_h: () -> footnote_hash
|
|
987
|
+
end
|
|
988
|
+
end
|
|
989
|
+
|
|
990
|
+
# Structured document representation (class from document_structure.rb)
|
|
991
|
+
# Note: This is Result::DocumentStructure, distinct from Kreuzberg::DocumentStructure (T::Struct)
|
|
992
|
+
class DocumentStructure
|
|
993
|
+
attr_reader nodes: Array[DocumentNode]
|
|
994
|
+
|
|
995
|
+
def initialize: (Hash[String | Symbol, untyped] hash) -> void
|
|
996
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
997
|
+
|
|
998
|
+
private
|
|
999
|
+
|
|
1000
|
+
def parse_nodes: (Array[untyped]? nodes_data) -> Array[DocumentNode]
|
|
1001
|
+
end
|
|
1002
|
+
|
|
1003
|
+
# Single node in the document structure tree (class from document_structure.rb)
|
|
1004
|
+
# Note: This is Result::DocumentNode, distinct from Kreuzberg::DocumentNode (T::Struct)
|
|
1005
|
+
class DocumentNode
|
|
1006
|
+
attr_reader id: String
|
|
1007
|
+
attr_reader content: Hash[untyped, untyped]
|
|
1008
|
+
attr_reader parent: Integer?
|
|
1009
|
+
attr_reader children: Array[Integer]
|
|
1010
|
+
attr_reader content_layer: String
|
|
1011
|
+
attr_reader page: Integer?
|
|
1012
|
+
attr_reader page_end: Integer?
|
|
1013
|
+
attr_reader bbox: DocumentBoundingBox?
|
|
1014
|
+
attr_reader annotations: Array[DocumentAnnotation]
|
|
1015
|
+
|
|
1016
|
+
def initialize: (Hash[String | Symbol, untyped] hash) -> void
|
|
1017
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
1018
|
+
|
|
1019
|
+
private
|
|
1020
|
+
|
|
1021
|
+
def assign_core_fields: (Hash[String | Symbol, untyped] hash) -> void
|
|
1022
|
+
def assign_tree_fields: (Hash[String | Symbol, untyped] hash) -> void
|
|
1023
|
+
def assign_metadata_fields: (Hash[String | Symbol, untyped] hash) -> void
|
|
1024
|
+
def parse_children: (Array[untyped]? children_data) -> Array[Integer]
|
|
1025
|
+
def extract_child_index: (untyped child) -> Integer
|
|
1026
|
+
def parse_bbox: (Hash[String | Symbol, untyped]? bbox_data) -> DocumentBoundingBox?
|
|
1027
|
+
def parse_annotations: (Array[untyped]? annotations_data) -> Array[DocumentAnnotation]
|
|
1028
|
+
end
|
|
1029
|
+
|
|
1030
|
+
# Bounding box for document node positioning (class from document_structure.rb)
|
|
1031
|
+
# Note: This is Result::DocumentBoundingBox, distinct from Kreuzberg::DocumentBoundingBox (T::Struct)
|
|
1032
|
+
class DocumentBoundingBox
|
|
1033
|
+
attr_reader x0: Float?
|
|
1034
|
+
attr_reader y0: Float?
|
|
1035
|
+
attr_reader x1: Float?
|
|
1036
|
+
attr_reader y1: Float?
|
|
1037
|
+
|
|
1038
|
+
def initialize: (Hash[String | Symbol, untyped] hash) -> void
|
|
1039
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
1040
|
+
|
|
1041
|
+
private
|
|
1042
|
+
|
|
1043
|
+
def extract_float: (Hash[String | Symbol, untyped] hash, String key) -> Float?
|
|
1044
|
+
end
|
|
1045
|
+
|
|
1046
|
+
# Annotation for a document node (class from document_structure.rb)
|
|
1047
|
+
# Note: This is Result::DocumentAnnotation, distinct from Kreuzberg::DocumentAnnotation (T::Struct)
|
|
1048
|
+
class DocumentAnnotation
|
|
1049
|
+
attr_reader start: Integer
|
|
1050
|
+
attr_reader end_offset: Integer
|
|
1051
|
+
attr_reader annotation_type: String
|
|
1052
|
+
attr_reader url: String?
|
|
1053
|
+
attr_reader title: String?
|
|
1054
|
+
|
|
1055
|
+
def initialize: (Hash[String | Symbol, untyped] hash) -> void
|
|
1056
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
1057
|
+
|
|
1058
|
+
private
|
|
1059
|
+
|
|
1060
|
+
def parse_kind: (Hash[String | Symbol, untyped]? kind_hash) -> void
|
|
1061
|
+
end
|
|
1062
|
+
|
|
1063
|
+
attr_reader content: String
|
|
1064
|
+
attr_reader mime_type: String
|
|
1065
|
+
attr_reader metadata: metadata_hash
|
|
1066
|
+
attr_reader metadata_json: String
|
|
1067
|
+
attr_reader tables: Array[Table]
|
|
1068
|
+
attr_reader detected_languages: Array[String]?
|
|
1069
|
+
attr_reader chunks: Array[Chunk]?
|
|
1070
|
+
attr_reader images: Array[Image]?
|
|
1071
|
+
attr_reader pages: Array[PageContent]?
|
|
1072
|
+
attr_reader elements: Array[ElementStruct]?
|
|
1073
|
+
attr_reader ocr_elements: Array[OcrElement]?
|
|
1074
|
+
attr_reader djot_content: DjotContent?
|
|
1075
|
+
attr_reader document: DocumentStructure?
|
|
1076
|
+
attr_reader extracted_keywords: Array[ExtractedKeyword]?
|
|
1077
|
+
attr_reader quality_score: Float?
|
|
1078
|
+
attr_reader processing_warnings: Array[ProcessingWarning]?
|
|
1079
|
+
|
|
1080
|
+
def initialize: (extraction_result_hash hash) -> void
|
|
1081
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
1082
|
+
def to_json: (*untyped) -> String
|
|
1083
|
+
|
|
1084
|
+
def page_count: () -> Integer
|
|
1085
|
+
def chunk_count: () -> Integer
|
|
1086
|
+
def detected_language: () -> String?
|
|
1087
|
+
def metadata_field: (String | Symbol name) -> untyped
|
|
1088
|
+
|
|
1089
|
+
private
|
|
1090
|
+
|
|
1091
|
+
def parse_metadata: (String metadata_json) -> metadata_hash
|
|
1092
|
+
def parse_tables: (Array[table_hash]? tables_data) -> Array[Table]
|
|
1093
|
+
def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
|
|
1094
|
+
def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]?
|
|
1095
|
+
def parse_images: (Array[image_hash]? images_data) -> Array[Image]?
|
|
1096
|
+
def parse_pages: (Array[page_content_hash]? pages_data) -> Array[PageContent]?
|
|
1097
|
+
def parse_elements: (Array[untyped]? elements_data) -> Array[ElementStruct]?
|
|
1098
|
+
def parse_element: (Hash[String, untyped] element_hash) -> ElementStruct
|
|
1099
|
+
def parse_element_coordinates: (Hash[String, untyped]? coordinates_data) -> ElementBoundingBox?
|
|
1100
|
+
def parse_ocr_elements: (Array[ocr_element_hash]? ocr_elements_data) -> Array[OcrElement]?
|
|
1101
|
+
def parse_page_hierarchy: (Hash[String, untyped]? hierarchy_data) -> PageHierarchy?
|
|
1102
|
+
def parse_djot_content: (Hash[String, untyped]? djot_data) -> DjotContent?
|
|
1103
|
+
def parse_document_structure: (Hash[String, untyped]? document_data) -> DocumentStructure?
|
|
1104
|
+
def parse_extracted_keywords: (Array[extracted_keyword_hash]? keywords_data) -> Array[ExtractedKeyword]?
|
|
1105
|
+
def parse_processing_warnings: (Array[processing_warning_hash]? warnings_data) -> Array[ProcessingWarning]
|
|
1106
|
+
def get_value: (Hash[String | Symbol, untyped] hash, String key, ?untyped default) -> untyped
|
|
1107
|
+
def serialize_tables: () -> Array[table_hash]
|
|
1108
|
+
def serialize_chunks: () -> Array[chunk_hash]?
|
|
1109
|
+
def serialize_images: () -> Array[image_hash]?
|
|
1110
|
+
def serialize_pages: () -> Array[Hash[Symbol, untyped]]?
|
|
1111
|
+
def serialize_elements: () -> Array[Hash[Symbol, untyped]]?
|
|
1112
|
+
def serialize_ocr_elements: () -> Array[ocr_element_hash]?
|
|
1113
|
+
def parse_ocr_geometry: (Hash[String, untyped]? data) -> OcrBoundingGeometry?
|
|
1114
|
+
def parse_ocr_confidence: (Hash[String, untyped]? data) -> OcrConfidence?
|
|
1115
|
+
def parse_ocr_rotation: (Hash[String, untyped]? data) -> OcrRotation?
|
|
1116
|
+
end
|
|
1117
|
+
|
|
1118
|
+
# Module methods (extraction API)
|
|
1119
|
+
def self.extract_file_sync: (
|
|
1120
|
+
path: String | Pathname,
|
|
1121
|
+
?mime_type: String?,
|
|
1122
|
+
?config: config_input?
|
|
1123
|
+
) -> Result
|
|
1124
|
+
|
|
1125
|
+
def self.extract_bytes_sync: (
|
|
1126
|
+
data: String,
|
|
1127
|
+
mime_type: String,
|
|
1128
|
+
?config: config_input?
|
|
1129
|
+
) -> Result
|
|
1130
|
+
|
|
1131
|
+
def self.batch_extract_files_sync: (
|
|
1132
|
+
paths: Array[String | Pathname],
|
|
1133
|
+
?config: config_input?
|
|
1134
|
+
) -> Array[Result]
|
|
1135
|
+
|
|
1136
|
+
def self.batch_extract_bytes_sync: (
|
|
1137
|
+
data_array: Array[String],
|
|
1138
|
+
mime_types: Array[String],
|
|
1139
|
+
?config: config_input?
|
|
1140
|
+
) -> Array[Result]
|
|
1141
|
+
|
|
1142
|
+
def self.extract_file: (
|
|
1143
|
+
path: String | Pathname,
|
|
1144
|
+
?mime_type: String?,
|
|
1145
|
+
?config: config_input?
|
|
1146
|
+
) -> Result
|
|
1147
|
+
|
|
1148
|
+
def self.extract_bytes: (
|
|
1149
|
+
data: String,
|
|
1150
|
+
mime_type: String,
|
|
1151
|
+
?config: config_input?
|
|
1152
|
+
) -> Result
|
|
1153
|
+
|
|
1154
|
+
def self.batch_extract_files: (
|
|
1155
|
+
paths: Array[String | Pathname],
|
|
1156
|
+
?config: config_input?
|
|
1157
|
+
) -> Array[Result]
|
|
1158
|
+
|
|
1159
|
+
def self.batch_extract_bytes: (
|
|
1160
|
+
data_array: Array[String],
|
|
1161
|
+
mime_types: Array[String],
|
|
1162
|
+
?config: config_input?
|
|
1163
|
+
) -> Array[Result]
|
|
1164
|
+
|
|
1165
|
+
# Cache API
|
|
1166
|
+
def self.clear_cache: () -> void
|
|
1167
|
+
def self.cache_stats: () -> Hash[Symbol | String, Integer]
|
|
1168
|
+
|
|
1169
|
+
# Config loading (native methods)
|
|
1170
|
+
def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
|
|
1171
|
+
def self._config_discover_native: () -> Hash[Symbol, untyped]?
|
|
1172
|
+
|
|
1173
|
+
# Error introspection (native methods)
|
|
1174
|
+
def self._last_error_code_native: () -> Integer
|
|
1175
|
+
def self._last_panic_context_json_native: () -> String?
|
|
1176
|
+
def self._get_error_details_native: () -> Hash[String, untyped]
|
|
1177
|
+
def self._classify_error_native: (String message) -> Hash[String, untyped]
|
|
1178
|
+
def self._error_code_name_native: (Integer code) -> String
|
|
1179
|
+
def self._error_code_description_native: (Integer code) -> String
|
|
1180
|
+
|
|
1181
|
+
# MIME type detection
|
|
1182
|
+
def self.detect_mime_type: (String data) -> String
|
|
1183
|
+
def self.detect_mime_type_from_path: (String path) -> String
|
|
1184
|
+
def self.get_extensions_for_mime: (String mime_type) -> Array[String]
|
|
1185
|
+
def self.validate_mime_type: (String mime_type) -> String
|
|
1186
|
+
|
|
1187
|
+
# Validation native methods
|
|
1188
|
+
def self._validate_binarization_method_native: (String method) -> Integer
|
|
1189
|
+
def self._validate_ocr_backend_native: (String backend) -> Integer
|
|
1190
|
+
def self._validate_language_code_native: (String code) -> Integer
|
|
1191
|
+
def self._validate_token_reduction_level_native: (String level) -> Integer
|
|
1192
|
+
def self._validate_tesseract_psm_native: (Integer psm) -> Integer
|
|
1193
|
+
def self._validate_tesseract_oem_native: (Integer oem) -> Integer
|
|
1194
|
+
def self._validate_output_format_native: (String format) -> Integer
|
|
1195
|
+
def self._validate_confidence_native: (Float confidence) -> Integer
|
|
1196
|
+
def self._validate_dpi_native: (Integer dpi) -> Integer
|
|
1197
|
+
def self._validate_chunking_params_native: (Integer max_chars, Integer max_overlap) -> Integer
|
|
1198
|
+
def self._get_valid_binarization_methods_native: () -> String
|
|
1199
|
+
def self._get_valid_language_codes_native: () -> String
|
|
1200
|
+
def self._get_valid_ocr_backends_native: () -> String
|
|
1201
|
+
def self._get_valid_token_reduction_levels_native: () -> String
|
|
1202
|
+
|
|
1203
|
+
# Config wrapper functions
|
|
1204
|
+
def self._config_to_json_native: (String config_json) -> String
|
|
1205
|
+
def self._config_get_field_native: (String config_json, String field_name) -> untyped
|
|
1206
|
+
def self._config_merge_native: (String base_json, String override_json) -> String
|
|
1207
|
+
|
|
1208
|
+
# Result wrapper functions
|
|
1209
|
+
def self._result_page_count_native: (untyped result) -> Integer
|
|
1210
|
+
def self._result_chunk_count_native: (untyped result) -> Integer
|
|
1211
|
+
def self._result_detected_language_native: (untyped result) -> String?
|
|
1212
|
+
def self._result_metadata_field_native: (untyped result, String field_name) -> untyped
|
|
1213
|
+
|
|
1214
|
+
# Plugin registration
|
|
1215
|
+
def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
|
|
1216
|
+
def self.unregister_post_processor: (String name) -> void
|
|
1217
|
+
def self.clear_post_processors: () -> void
|
|
1218
|
+
def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void
|
|
1219
|
+
def self.unregister_validator: (String name) -> void
|
|
1220
|
+
def self.clear_validators: () -> void
|
|
1221
|
+
def self.register_ocr_backend: (String name, _OcrBackend backend) -> void
|
|
1222
|
+
def self.unregister_ocr_backend: (String name) -> void
|
|
1223
|
+
def self.list_ocr_backends: () -> Array[String]
|
|
1224
|
+
def self.clear_ocr_backends: () -> void
|
|
1225
|
+
def self.unregister_document_extractor: (String name) -> void
|
|
1226
|
+
def self.list_document_extractors: () -> Array[String]
|
|
1227
|
+
def self.clear_document_extractors: () -> void
|
|
1228
|
+
def self.list_post_processors: () -> Array[String]
|
|
1229
|
+
def self.list_validators: () -> Array[String]
|
|
1230
|
+
|
|
1231
|
+
interface _PostProcessor
|
|
1232
|
+
def call: (extraction_result_hash result) -> extraction_result_hash
|
|
1233
|
+
end
|
|
1234
|
+
|
|
1235
|
+
interface _Validator
|
|
1236
|
+
def call: (extraction_result_hash result) -> void
|
|
1237
|
+
end
|
|
1238
|
+
|
|
1239
|
+
interface _OcrBackend
|
|
1240
|
+
def name: () -> String
|
|
1241
|
+
def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String
|
|
1242
|
+
end
|
|
1243
|
+
|
|
1244
|
+
module ErrorContext
|
|
1245
|
+
def self.last_error_code: () -> Integer
|
|
1246
|
+
def self.last_panic_context: () -> Errors::PanicContext?
|
|
1247
|
+
def self.last_panic_context_json: () -> String?
|
|
1248
|
+
def self.error_details: () -> Hash[String, untyped]
|
|
1249
|
+
def self.classify_error: (String message) -> (Hash[String, untyped] | Integer)
|
|
1250
|
+
def self.error_code_name: (Integer code) -> String
|
|
1251
|
+
def self.error_code_description: (Integer code) -> String
|
|
1252
|
+
end
|
|
1253
|
+
|
|
1254
|
+
module Errors
|
|
1255
|
+
# Panic context information from FFI error introspection
|
|
1256
|
+
class PanicContext
|
|
1257
|
+
attr_reader file: String
|
|
1258
|
+
attr_reader line: Integer
|
|
1259
|
+
attr_reader function: String
|
|
1260
|
+
attr_reader message: String
|
|
1261
|
+
attr_reader timestamp_secs: Integer
|
|
1262
|
+
|
|
1263
|
+
def initialize: (
|
|
1264
|
+
file: String,
|
|
1265
|
+
line: Integer,
|
|
1266
|
+
function: String,
|
|
1267
|
+
message: String,
|
|
1268
|
+
timestamp_secs: Integer
|
|
1269
|
+
) -> void
|
|
1270
|
+
def to_s: () -> String
|
|
1271
|
+
def to_h: () -> Hash[Symbol, String | Integer]
|
|
1272
|
+
def self.from_json: (String) -> PanicContext?
|
|
1273
|
+
|
|
1274
|
+
private
|
|
1275
|
+
|
|
1276
|
+
def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer}
|
|
1277
|
+
end
|
|
1278
|
+
|
|
1279
|
+
class Error < StandardError
|
|
1280
|
+
attr_reader panic_context: PanicContext?
|
|
1281
|
+
attr_reader error_code: Integer?
|
|
1282
|
+
|
|
1283
|
+
def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
1284
|
+
end
|
|
1285
|
+
|
|
1286
|
+
class ValidationError < Error
|
|
1287
|
+
end
|
|
1288
|
+
|
|
1289
|
+
class ParsingError < Error
|
|
1290
|
+
attr_reader context: Hash[untyped, untyped]?
|
|
1291
|
+
|
|
1292
|
+
def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
1293
|
+
end
|
|
1294
|
+
|
|
1295
|
+
class OCRError < Error
|
|
1296
|
+
attr_reader context: Hash[untyped, untyped]?
|
|
1297
|
+
|
|
1298
|
+
def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
1299
|
+
end
|
|
1300
|
+
|
|
1301
|
+
class MissingDependencyError < Error
|
|
1302
|
+
attr_reader dependency: String?
|
|
1303
|
+
|
|
1304
|
+
def initialize: (String message, ?dependency: String?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
|
|
1305
|
+
end
|
|
1306
|
+
|
|
1307
|
+
class IOError < Error
|
|
1308
|
+
end
|
|
1309
|
+
|
|
1310
|
+
class PluginError < Error
|
|
1311
|
+
end
|
|
1312
|
+
|
|
1313
|
+
class UnsupportedFormatError < Error
|
|
1314
|
+
end
|
|
1315
|
+
end
|
|
1316
|
+
|
|
1317
|
+
# Internal modules (prepended to Kreuzberg singleton)
|
|
1318
|
+
# These are not checked by steep - see Steepfile
|
|
1319
|
+
module CacheAPI : Object
|
|
1320
|
+
end
|
|
1321
|
+
|
|
1322
|
+
module ExtractionAPI : Object
|
|
1323
|
+
end
|
|
1324
|
+
|
|
1325
|
+
module PostProcessorProtocol
|
|
1326
|
+
def call: (extraction_result_hash result) -> extraction_result_hash
|
|
1327
|
+
end
|
|
1328
|
+
|
|
1329
|
+
module ValidatorProtocol
|
|
1330
|
+
def call: (extraction_result_hash result) -> void
|
|
1331
|
+
end
|
|
1332
|
+
|
|
1333
|
+
module OcrBackendProtocol
|
|
1334
|
+
def name: () -> String
|
|
1335
|
+
def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String
|
|
1336
|
+
end
|
|
1337
|
+
end
|