kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
data/sig/kreuzberg.rbs
CHANGED
|
@@ -25,6 +25,8 @@ module Kreuzberg
|
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
class Tesseract
|
|
28
|
+
attr_reader options: Hash[Symbol, untyped]
|
|
29
|
+
|
|
28
30
|
def initialize: (**untyped options) -> void
|
|
29
31
|
def to_h: () -> Hash[Symbol, untyped]
|
|
30
32
|
end
|
|
@@ -82,13 +84,25 @@ module Kreuzberg
|
|
|
82
84
|
def to_h: () -> Hash[Symbol, untyped]
|
|
83
85
|
end
|
|
84
86
|
|
|
87
|
+
class Hierarchy
|
|
88
|
+
attr_reader enabled: bool
|
|
89
|
+
attr_reader k_clusters: Integer
|
|
90
|
+
attr_reader include_bbox: bool
|
|
91
|
+
attr_reader ocr_coverage_threshold: Float?
|
|
92
|
+
|
|
93
|
+
def initialize: (?enabled: bool, ?k_clusters: Integer, ?include_bbox: bool, ?ocr_coverage_threshold: Float?) -> void
|
|
94
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
95
|
+
def self.from_h: (Hash[Symbol, untyped]?) -> Hierarchy?
|
|
96
|
+
end
|
|
97
|
+
|
|
85
98
|
class PDF
|
|
86
99
|
attr_reader extract_images: bool
|
|
87
100
|
attr_reader passwords: Array[String]?
|
|
88
101
|
attr_reader extract_metadata: bool
|
|
89
102
|
attr_reader font_config: FontConfig?
|
|
103
|
+
attr_reader hierarchy: Hierarchy?
|
|
90
104
|
|
|
91
|
-
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?) -> void
|
|
105
|
+
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?) -> void
|
|
92
106
|
def to_h: () -> Hash[Symbol, untyped]
|
|
93
107
|
end
|
|
94
108
|
|
|
@@ -160,19 +174,44 @@ module Kreuzberg
|
|
|
160
174
|
end
|
|
161
175
|
|
|
162
176
|
class HtmlOptions
|
|
177
|
+
attr_reader options: Hash[Symbol, untyped]
|
|
178
|
+
|
|
163
179
|
def initialize: (**untyped options) -> void
|
|
164
180
|
def to_h: () -> Hash[Symbol, untyped]
|
|
165
181
|
end
|
|
166
182
|
|
|
183
|
+
class KeywordYakeParams
|
|
184
|
+
attr_reader window_size: Integer
|
|
185
|
+
|
|
186
|
+
def initialize: (?window_size: Integer) -> void
|
|
187
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
class KeywordRakeParams
|
|
191
|
+
attr_reader min_word_length: Integer
|
|
192
|
+
attr_reader max_words_per_phrase: Integer
|
|
193
|
+
|
|
194
|
+
def initialize: (?min_word_length: Integer, ?max_words_per_phrase: Integer) -> void
|
|
195
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
196
|
+
end
|
|
197
|
+
|
|
167
198
|
class Keywords
|
|
199
|
+
attr_reader algorithm: String?
|
|
200
|
+
attr_reader max_keywords: Integer?
|
|
201
|
+
attr_reader min_score: Float?
|
|
202
|
+
attr_reader ngram_range: Array[Integer]?
|
|
203
|
+
attr_reader language: String?
|
|
204
|
+
attr_reader yake_params: KeywordYakeParams?
|
|
205
|
+
attr_reader rake_params: KeywordRakeParams?
|
|
206
|
+
|
|
168
207
|
def initialize: (
|
|
169
|
-
?algorithm: Symbol?,
|
|
208
|
+
?algorithm: (Symbol | String)?,
|
|
170
209
|
?max_keywords: Integer?,
|
|
171
210
|
?min_score: Float?,
|
|
172
211
|
?ngram_range: Array[Integer]?,
|
|
173
|
-
?language: Symbol?,
|
|
174
|
-
?yake_params: Hash[Symbol, untyped]?,
|
|
175
|
-
?rake_params: Hash[Symbol, untyped]?
|
|
212
|
+
?language: (Symbol | String)?,
|
|
213
|
+
?yake_params: (KeywordYakeParams | Hash[Symbol, untyped])?,
|
|
214
|
+
?rake_params: (KeywordRakeParams | Hash[Symbol, untyped])?
|
|
176
215
|
) -> void
|
|
177
216
|
def to_h: () -> Hash[Symbol, untyped]
|
|
178
217
|
end
|
|
@@ -194,7 +233,7 @@ module Kreuzberg
|
|
|
194
233
|
attr_reader chunking: Chunking?
|
|
195
234
|
attr_reader language_detection: LanguageDetection?
|
|
196
235
|
attr_reader pdf_options: PDF?
|
|
197
|
-
attr_reader
|
|
236
|
+
attr_reader images: ImageExtraction?
|
|
198
237
|
attr_reader postprocessor: PostProcessor?
|
|
199
238
|
attr_reader token_reduction: TokenReduction?
|
|
200
239
|
attr_reader keywords: Keywords?
|
|
@@ -204,7 +243,10 @@ module Kreuzberg
|
|
|
204
243
|
attr_reader output_format: String?
|
|
205
244
|
attr_reader result_format: String?
|
|
206
245
|
|
|
246
|
+
alias image_extraction images
|
|
247
|
+
|
|
207
248
|
def self.from_file: (String path) -> Extraction
|
|
249
|
+
def self.discover: () -> Extraction?
|
|
208
250
|
def initialize: (
|
|
209
251
|
?use_cache: bool,
|
|
210
252
|
?enable_quality_processing: bool,
|
|
@@ -224,10 +266,23 @@ module Kreuzberg
|
|
|
224
266
|
?result_format: String?
|
|
225
267
|
) -> void
|
|
226
268
|
def to_h: () -> Hash[Symbol, untyped]
|
|
269
|
+
def to_json: (*untyped) -> String
|
|
270
|
+
def get_field: (String | Symbol field_name) -> untyped
|
|
271
|
+
def merge: (Extraction | Hash[Symbol, untyped] other) -> Extraction
|
|
272
|
+
def merge!: (Extraction | Hash[Symbol, untyped] other) -> self
|
|
273
|
+
def []: (Symbol | String key) -> untyped
|
|
274
|
+
def []=: (Symbol | String key, untyped value) -> untyped
|
|
275
|
+
def output_format=: (String? value) -> String?
|
|
276
|
+
def result_format=: (String? value) -> String?
|
|
227
277
|
|
|
228
278
|
private
|
|
229
279
|
|
|
230
280
|
def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
|
|
281
|
+
def extract_from_hash: (Hash[Symbol, untyped]? hash, Hash[Symbol, untyped] defaults) -> Hash[Symbol, untyped]
|
|
282
|
+
def assign_attributes: (Hash[Symbol, untyped] params) -> void
|
|
283
|
+
def validate_output_format: (untyped value) -> String?
|
|
284
|
+
def validate_result_format: (untyped value) -> String?
|
|
285
|
+
def update_from_merged: (Extraction merged) -> void
|
|
231
286
|
end
|
|
232
287
|
|
|
233
288
|
end
|
|
@@ -249,14 +304,23 @@ module Kreuzberg
|
|
|
249
304
|
content: String,
|
|
250
305
|
mime_type: String,
|
|
251
306
|
metadata_json: String,
|
|
307
|
+
metadata: Hash[String, untyped],
|
|
252
308
|
tables: Array[table_hash]?,
|
|
253
309
|
detected_languages: Array[String]?,
|
|
254
310
|
chunks: Array[chunk_hash]?,
|
|
255
311
|
images: Array[image_hash]?,
|
|
312
|
+
pages: Array[page_content_hash]?,
|
|
256
313
|
elements: Array[element_hash]?,
|
|
257
314
|
djot_content: djot_content_hash?
|
|
258
315
|
}
|
|
259
316
|
|
|
317
|
+
type page_content_hash = {
|
|
318
|
+
page_number: Integer,
|
|
319
|
+
content: String,
|
|
320
|
+
tables: Array[table_hash],
|
|
321
|
+
images: Array[image_hash]
|
|
322
|
+
}
|
|
323
|
+
|
|
260
324
|
type djot_content_hash = {
|
|
261
325
|
plain_text: String,
|
|
262
326
|
blocks: Array[formatted_block_hash],
|
|
@@ -315,8 +379,8 @@ module Kreuzberg
|
|
|
315
379
|
byte_start: Integer,
|
|
316
380
|
byte_end: Integer,
|
|
317
381
|
token_count: Integer?,
|
|
318
|
-
chunk_index: Integer
|
|
319
|
-
total_chunks: Integer
|
|
382
|
+
chunk_index: Integer,
|
|
383
|
+
total_chunks: Integer,
|
|
320
384
|
first_page: Integer?,
|
|
321
385
|
last_page: Integer?,
|
|
322
386
|
embedding: Array[Float]?
|
|
@@ -331,7 +395,7 @@ module Kreuzberg
|
|
|
331
395
|
height: Integer?,
|
|
332
396
|
colorspace: String?,
|
|
333
397
|
bits_per_component: Integer?,
|
|
334
|
-
is_mask: bool
|
|
398
|
+
is_mask: bool,
|
|
335
399
|
description: String?,
|
|
336
400
|
ocr_result: extraction_result_hash?
|
|
337
401
|
}
|
|
@@ -361,8 +425,8 @@ module Kreuzberg
|
|
|
361
425
|
attr_reader byte_start: Integer
|
|
362
426
|
attr_reader byte_end: Integer
|
|
363
427
|
attr_reader token_count: Integer?
|
|
364
|
-
attr_reader chunk_index: Integer
|
|
365
|
-
attr_reader total_chunks: Integer
|
|
428
|
+
attr_reader chunk_index: Integer
|
|
429
|
+
attr_reader total_chunks: Integer
|
|
366
430
|
attr_reader first_page: Integer?
|
|
367
431
|
attr_reader last_page: Integer?
|
|
368
432
|
attr_reader embedding: Array[Float]?
|
|
@@ -372,8 +436,8 @@ module Kreuzberg
|
|
|
372
436
|
byte_start: Integer,
|
|
373
437
|
byte_end: Integer,
|
|
374
438
|
token_count: Integer?,
|
|
375
|
-
chunk_index: Integer
|
|
376
|
-
total_chunks: Integer
|
|
439
|
+
chunk_index: Integer,
|
|
440
|
+
total_chunks: Integer,
|
|
377
441
|
first_page: Integer?,
|
|
378
442
|
last_page: Integer?,
|
|
379
443
|
embedding: Array[Float]?
|
|
@@ -390,7 +454,7 @@ module Kreuzberg
|
|
|
390
454
|
attr_reader height: Integer?
|
|
391
455
|
attr_reader colorspace: String?
|
|
392
456
|
attr_reader bits_per_component: Integer?
|
|
393
|
-
attr_reader is_mask: bool
|
|
457
|
+
attr_reader is_mask: bool
|
|
394
458
|
attr_reader description: String?
|
|
395
459
|
attr_reader ocr_result: Result?
|
|
396
460
|
|
|
@@ -403,18 +467,83 @@ module Kreuzberg
|
|
|
403
467
|
height: Integer?,
|
|
404
468
|
colorspace: String?,
|
|
405
469
|
bits_per_component: Integer?,
|
|
406
|
-
is_mask: bool
|
|
470
|
+
is_mask: bool,
|
|
407
471
|
description: String?,
|
|
408
472
|
ocr_result: Result?
|
|
409
473
|
) -> void
|
|
410
474
|
def to_h: () -> image_hash
|
|
411
475
|
end
|
|
412
476
|
|
|
477
|
+
# Page content with text and extracted elements
|
|
478
|
+
class PageContent
|
|
479
|
+
attr_reader page_number: Integer
|
|
480
|
+
attr_reader content: String
|
|
481
|
+
attr_reader tables: Array[Table]
|
|
482
|
+
attr_reader images: Array[Image]?
|
|
483
|
+
attr_reader hierarchy: PageHierarchy?
|
|
484
|
+
|
|
485
|
+
def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?) -> void
|
|
486
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
487
|
+
end
|
|
488
|
+
|
|
489
|
+
# Hierarchical block element (for page hierarchy)
|
|
490
|
+
class HierarchicalBlock
|
|
491
|
+
attr_reader text: String
|
|
492
|
+
attr_reader font_size: Float?
|
|
493
|
+
attr_reader level: String?
|
|
494
|
+
attr_reader bbox: Array[Float]?
|
|
495
|
+
|
|
496
|
+
def initialize: (text: String, font_size: Float?, level: String?, bbox: Array[Float]?) -> void
|
|
497
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
498
|
+
end
|
|
499
|
+
|
|
500
|
+
# Page hierarchy information
|
|
501
|
+
class PageHierarchy
|
|
502
|
+
attr_reader block_count: Integer
|
|
503
|
+
attr_reader blocks: Array[HierarchicalBlock]
|
|
504
|
+
|
|
505
|
+
def initialize: (block_count: Integer, blocks: Array[HierarchicalBlock]) -> void
|
|
506
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
507
|
+
end
|
|
508
|
+
|
|
509
|
+
# Element bounding box coordinates
|
|
510
|
+
class ElementBoundingBox
|
|
511
|
+
attr_reader x0: Float
|
|
512
|
+
attr_reader y0: Float
|
|
513
|
+
attr_reader x1: Float
|
|
514
|
+
attr_reader y1: Float
|
|
515
|
+
|
|
516
|
+
def initialize: (x0: Float, y0: Float, x1: Float, y1: Float) -> void
|
|
517
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
518
|
+
end
|
|
519
|
+
|
|
520
|
+
# Element metadata
|
|
521
|
+
class ElementMetadataStruct
|
|
522
|
+
attr_reader page_number: Integer?
|
|
523
|
+
attr_reader filename: String?
|
|
524
|
+
attr_reader coordinates: ElementBoundingBox?
|
|
525
|
+
attr_reader element_index: Integer?
|
|
526
|
+
attr_reader additional: Hash[String, String]
|
|
527
|
+
|
|
528
|
+
def initialize: (page_number: Integer?, filename: String?, coordinates: ElementBoundingBox?, element_index: Integer?, additional: Hash[String, String]) -> void
|
|
529
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
530
|
+
end
|
|
531
|
+
|
|
532
|
+
# Structured document element
|
|
533
|
+
class ElementStruct
|
|
534
|
+
attr_reader element_id: String
|
|
535
|
+
attr_reader element_type: String
|
|
536
|
+
attr_reader text: String
|
|
537
|
+
attr_reader metadata: ElementMetadataStruct
|
|
538
|
+
|
|
539
|
+
def initialize: (element_id: String, element_type: String, text: String, metadata: ElementMetadataStruct) -> void
|
|
540
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
541
|
+
end
|
|
542
|
+
|
|
413
543
|
# Structured Djot document representation
|
|
414
544
|
class DjotContent
|
|
415
545
|
attr_reader plain_text: String
|
|
416
546
|
attr_reader blocks: Array[DjotContent::FormattedBlock]
|
|
417
|
-
attr_reader metadata: Hash[untyped, untyped]
|
|
418
547
|
attr_reader metadata_json: String
|
|
419
548
|
attr_reader tables: Array[untyped]
|
|
420
549
|
attr_reader images: Array[DjotContent::DjotImage]
|
|
@@ -424,6 +553,7 @@ module Kreuzberg
|
|
|
424
553
|
|
|
425
554
|
def initialize: (untyped hash) -> void
|
|
426
555
|
def to_h: () -> Hash[Symbol, untyped]
|
|
556
|
+
def metadata: () -> Hash[untyped, untyped]
|
|
427
557
|
|
|
428
558
|
private
|
|
429
559
|
|
|
@@ -484,64 +614,84 @@ module Kreuzberg
|
|
|
484
614
|
attr_reader detected_languages: Array[String]?
|
|
485
615
|
attr_reader chunks: Array[Chunk]?
|
|
486
616
|
attr_reader images: Array[Image]?
|
|
617
|
+
attr_reader pages: Array[PageContent]?
|
|
618
|
+
attr_reader elements: Array[ElementStruct]?
|
|
487
619
|
attr_reader djot_content: DjotContent?
|
|
488
620
|
|
|
489
621
|
def initialize: (extraction_result_hash hash) -> void
|
|
490
622
|
def to_h: () -> Hash[Symbol, untyped]
|
|
491
623
|
def to_json: (*untyped) -> String
|
|
492
624
|
|
|
625
|
+
def page_count: () -> Integer
|
|
626
|
+
def chunk_count: () -> Integer
|
|
627
|
+
def detected_language: () -> String?
|
|
628
|
+
def metadata_field: (String | Symbol name) -> untyped
|
|
629
|
+
|
|
493
630
|
private
|
|
494
631
|
|
|
495
632
|
def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
|
|
496
633
|
def parse_tables: (Array[table_hash]? tables_data) -> Array[Table]
|
|
497
634
|
def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
|
|
498
635
|
def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]?
|
|
636
|
+
def parse_images: (Array[image_hash]? images_data) -> Array[Image]?
|
|
637
|
+
def parse_pages: (Array[page_content_hash]? pages_data) -> Array[PageContent]?
|
|
638
|
+
def parse_elements: (Array[untyped]? elements_data) -> Array[ElementStruct]?
|
|
639
|
+
def parse_element: (Hash[String, untyped] element_hash) -> ElementStruct
|
|
640
|
+
def parse_element_coordinates: (Hash[String, untyped]? coordinates_data) -> ElementBoundingBox?
|
|
641
|
+
def parse_page_hierarchy: (Hash[String, untyped]? hierarchy_data) -> PageHierarchy?
|
|
642
|
+
def parse_djot_content: (Hash[String, untyped]? djot_data) -> DjotContent?
|
|
643
|
+
def get_value: (Hash[String | Symbol, untyped] hash, String key, ?untyped default) -> untyped
|
|
644
|
+
def serialize_tables: () -> Array[table_hash]
|
|
645
|
+
def serialize_chunks: () -> Array[chunk_hash]?
|
|
646
|
+
def serialize_images: () -> Array[image_hash]?
|
|
647
|
+
def serialize_pages: () -> Array[Hash[Symbol, untyped]]?
|
|
648
|
+
def serialize_elements: () -> Array[Hash[Symbol, untyped]]?
|
|
499
649
|
end
|
|
500
650
|
|
|
501
651
|
# Module methods (extraction API)
|
|
502
652
|
def self.extract_file_sync: (
|
|
503
|
-
String | Pathname
|
|
653
|
+
path: String | Pathname,
|
|
504
654
|
?mime_type: String?,
|
|
505
655
|
?config: config_input?
|
|
506
656
|
) -> Result
|
|
507
657
|
|
|
508
658
|
def self.extract_bytes_sync: (
|
|
509
|
-
String
|
|
510
|
-
String
|
|
659
|
+
data: String,
|
|
660
|
+
mime_type: String,
|
|
511
661
|
?config: config_input?
|
|
512
662
|
) -> Result
|
|
513
663
|
|
|
514
664
|
def self.batch_extract_files_sync: (
|
|
515
|
-
Array[String | Pathname]
|
|
665
|
+
paths: Array[String | Pathname],
|
|
516
666
|
?config: config_input?
|
|
517
667
|
) -> Array[Result]
|
|
518
668
|
|
|
519
669
|
def self.batch_extract_bytes_sync: (
|
|
520
|
-
Array[String]
|
|
521
|
-
Array[String]
|
|
670
|
+
data_array: Array[String],
|
|
671
|
+
mime_types: Array[String],
|
|
522
672
|
?config: config_input?
|
|
523
673
|
) -> Array[Result]
|
|
524
674
|
|
|
525
675
|
def self.extract_file: (
|
|
526
|
-
String | Pathname
|
|
676
|
+
path: String | Pathname,
|
|
527
677
|
?mime_type: String?,
|
|
528
678
|
?config: config_input?
|
|
529
679
|
) -> Result
|
|
530
680
|
|
|
531
681
|
def self.extract_bytes: (
|
|
532
|
-
String
|
|
533
|
-
String
|
|
682
|
+
data: String,
|
|
683
|
+
mime_type: String,
|
|
534
684
|
?config: config_input?
|
|
535
685
|
) -> Result
|
|
536
686
|
|
|
537
687
|
def self.batch_extract_files: (
|
|
538
|
-
Array[String | Pathname]
|
|
688
|
+
paths: Array[String | Pathname],
|
|
539
689
|
?config: config_input?
|
|
540
690
|
) -> Array[Result]
|
|
541
691
|
|
|
542
692
|
def self.batch_extract_bytes: (
|
|
543
|
-
Array[String]
|
|
544
|
-
Array[String]
|
|
693
|
+
data_array: Array[String],
|
|
694
|
+
mime_types: Array[String],
|
|
545
695
|
?config: config_input?
|
|
546
696
|
) -> Array[Result]
|
|
547
697
|
|
|
@@ -549,17 +699,51 @@ module Kreuzberg
|
|
|
549
699
|
def self.clear_cache: () -> void
|
|
550
700
|
def self.cache_stats: () -> Hash[Symbol | String, Integer]
|
|
551
701
|
|
|
552
|
-
# Config loading (native
|
|
702
|
+
# Config loading (native methods)
|
|
553
703
|
def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
|
|
704
|
+
def self._config_discover_native: () -> Hash[Symbol, untyped]?
|
|
554
705
|
|
|
555
706
|
# Error introspection (native methods)
|
|
556
707
|
def self._last_error_code_native: () -> Integer
|
|
557
708
|
def self._last_panic_context_json_native: () -> String?
|
|
558
|
-
def self._get_error_details_native: () -> Hash[
|
|
559
|
-
def self._classify_error_native: (String message) ->
|
|
709
|
+
def self._get_error_details_native: () -> Hash[String, untyped]
|
|
710
|
+
def self._classify_error_native: (String message) -> Hash[String, untyped]
|
|
560
711
|
def self._error_code_name_native: (Integer code) -> String
|
|
561
712
|
def self._error_code_description_native: (Integer code) -> String
|
|
562
713
|
|
|
714
|
+
# MIME type detection
|
|
715
|
+
def self.detect_mime_type: (String data) -> String
|
|
716
|
+
def self.detect_mime_type_from_path: (String path) -> String
|
|
717
|
+
def self.get_extensions_for_mime: (String mime_type) -> Array[String]
|
|
718
|
+
def self.validate_mime_type: (String mime_type) -> String
|
|
719
|
+
|
|
720
|
+
# Validation native methods
|
|
721
|
+
def self._validate_binarization_method_native: (String method) -> Integer
|
|
722
|
+
def self._validate_ocr_backend_native: (String backend) -> Integer
|
|
723
|
+
def self._validate_language_code_native: (String code) -> Integer
|
|
724
|
+
def self._validate_token_reduction_level_native: (String level) -> Integer
|
|
725
|
+
def self._validate_tesseract_psm_native: (Integer psm) -> Integer
|
|
726
|
+
def self._validate_tesseract_oem_native: (Integer oem) -> Integer
|
|
727
|
+
def self._validate_output_format_native: (String format) -> Integer
|
|
728
|
+
def self._validate_confidence_native: (Float confidence) -> Integer
|
|
729
|
+
def self._validate_dpi_native: (Integer dpi) -> Integer
|
|
730
|
+
def self._validate_chunking_params_native: (Integer max_chars, Integer max_overlap) -> Integer
|
|
731
|
+
def self._get_valid_binarization_methods_native: () -> String
|
|
732
|
+
def self._get_valid_language_codes_native: () -> String
|
|
733
|
+
def self._get_valid_ocr_backends_native: () -> String
|
|
734
|
+
def self._get_valid_token_reduction_levels_native: () -> String
|
|
735
|
+
|
|
736
|
+
# Config wrapper functions
|
|
737
|
+
def self._config_to_json_native: (String config_json) -> String
|
|
738
|
+
def self._config_get_field_native: (String config_json, String field_name) -> untyped
|
|
739
|
+
def self._config_merge_native: (String base_json, String override_json) -> String
|
|
740
|
+
|
|
741
|
+
# Result wrapper functions
|
|
742
|
+
def self._result_page_count_native: (untyped result) -> Integer
|
|
743
|
+
def self._result_chunk_count_native: (untyped result) -> Integer
|
|
744
|
+
def self._result_detected_language_native: (untyped result) -> String?
|
|
745
|
+
def self._result_metadata_field_native: (untyped result, String field_name) -> untyped
|
|
746
|
+
|
|
563
747
|
# Plugin registration
|
|
564
748
|
def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
|
|
565
749
|
def self.unregister_post_processor: (String name) -> void
|
|
@@ -567,7 +751,15 @@ module Kreuzberg
|
|
|
567
751
|
def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void
|
|
568
752
|
def self.unregister_validator: (String name) -> void
|
|
569
753
|
def self.clear_validators: () -> void
|
|
570
|
-
def self.register_ocr_backend: (_OcrBackend backend) -> void
|
|
754
|
+
def self.register_ocr_backend: (String name, _OcrBackend backend) -> void
|
|
755
|
+
def self.unregister_ocr_backend: (String name) -> void
|
|
756
|
+
def self.list_ocr_backends: () -> Array[String]
|
|
757
|
+
def self.clear_ocr_backends: () -> void
|
|
758
|
+
def self.unregister_document_extractor: (String name) -> void
|
|
759
|
+
def self.list_document_extractors: () -> Array[String]
|
|
760
|
+
def self.clear_document_extractors: () -> void
|
|
761
|
+
def self.list_post_processors: () -> Array[String]
|
|
762
|
+
def self.list_validators: () -> Array[String]
|
|
571
763
|
|
|
572
764
|
interface _PostProcessor
|
|
573
765
|
def call: (extraction_result_hash result) -> extraction_result_hash
|
|
@@ -579,15 +771,15 @@ module Kreuzberg
|
|
|
579
771
|
|
|
580
772
|
interface _OcrBackend
|
|
581
773
|
def name: () -> String
|
|
582
|
-
def
|
|
774
|
+
def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String
|
|
583
775
|
end
|
|
584
776
|
|
|
585
777
|
module ErrorContext
|
|
586
778
|
def self.last_error_code: () -> Integer
|
|
587
779
|
def self.last_panic_context: () -> Errors::PanicContext?
|
|
588
780
|
def self.last_panic_context_json: () -> String?
|
|
589
|
-
def self.error_details: () -> Hash[
|
|
590
|
-
def self.classify_error: (String message) -> Integer
|
|
781
|
+
def self.error_details: () -> Hash[String, untyped]
|
|
782
|
+
def self.classify_error: (String message) -> (Hash[String, untyped] | Integer)
|
|
591
783
|
def self.error_code_name: (Integer code) -> String
|
|
592
784
|
def self.error_code_description: (Integer code) -> String
|
|
593
785
|
end
|
|
@@ -673,7 +865,6 @@ module Kreuzberg
|
|
|
673
865
|
|
|
674
866
|
module OcrBackendProtocol
|
|
675
867
|
def name: () -> String
|
|
676
|
-
def
|
|
677
|
-
def process_image: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
|
|
868
|
+
def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String
|
|
678
869
|
end
|
|
679
870
|
end
|
|
@@ -439,12 +439,14 @@ RSpec.describe 'Batch Operations' do
|
|
|
439
439
|
|
|
440
440
|
it 'batch results have consistent structure' do
|
|
441
441
|
paths = []
|
|
442
|
+
tempfiles = []
|
|
442
443
|
|
|
443
444
|
3.times do |i|
|
|
444
445
|
file = Tempfile.new(["struct_#{i}", '.txt'])
|
|
445
446
|
file.write("Structure test #{i}")
|
|
446
447
|
file.close
|
|
447
448
|
paths << file.path
|
|
449
|
+
tempfiles << file
|
|
448
450
|
end
|
|
449
451
|
|
|
450
452
|
config = Kreuzberg::Config::Extraction.new
|
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "4.2.6"
|
|
6
|
+
version = "4.2.7"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.91"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -43,11 +43,12 @@ tracing = "0.1"
|
|
|
43
43
|
# Utilities
|
|
44
44
|
ahash = "0.8.12"
|
|
45
45
|
base64 = "0.22.1"
|
|
46
|
+
bytes = { version = "1", features = ["serde"] }
|
|
46
47
|
hex = "0.4.3"
|
|
47
48
|
toml = "0.9.11"
|
|
48
49
|
num_cpus = "1.17.0"
|
|
49
50
|
once_cell = "1.21.3"
|
|
50
|
-
html-to-markdown-rs = { version = "2.24.
|
|
51
|
+
html-to-markdown-rs = { version = "2.24.4", default-features = false }
|
|
51
52
|
reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
|
|
52
53
|
image = { version = "0.25.9", default-features = false }
|
|
53
54
|
lzma-rust2 = { version = "0.15.7" }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.6"
|
|
3
|
+
version = "4.2.7"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -120,6 +120,7 @@ async-trait = { workspace = true }
|
|
|
120
120
|
base64 = { workspace = true }
|
|
121
121
|
base64-simd = "0.8"
|
|
122
122
|
bitvec = "1.0"
|
|
123
|
+
bytes = { workspace = true }
|
|
123
124
|
dashmap = "6.1"
|
|
124
125
|
dirs = "6.0"
|
|
125
126
|
simdutf8 = { version = "0.1", optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.6 Release**
|
|
20
|
+
> **🚀 Version 4.2.7 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
use axum::{
|
|
4
4
|
Json,
|
|
5
5
|
body::to_bytes,
|
|
6
|
-
extract::{FromRequest, Request, rejection::JsonRejection},
|
|
6
|
+
extract::{FromRequest, Multipart, Request, rejection::JsonRejection},
|
|
7
7
|
http::StatusCode,
|
|
8
8
|
response::{IntoResponse, Response},
|
|
9
9
|
};
|
|
@@ -63,6 +63,34 @@ where
|
|
|
63
63
|
}
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
+
/// Custom Multipart extractor that returns JSON error responses instead of plain text.
|
|
67
|
+
///
|
|
68
|
+
/// This wraps axum's `Multipart` extractor but uses `ApiError` as the rejection type,
|
|
69
|
+
/// ensuring that multipart parsing errors are returned as JSON with proper content type.
|
|
70
|
+
pub struct MultipartApi(pub Multipart);
|
|
71
|
+
|
|
72
|
+
impl<S> FromRequest<S> for MultipartApi
|
|
73
|
+
where
|
|
74
|
+
S: Send + Sync,
|
|
75
|
+
{
|
|
76
|
+
type Rejection = ApiError;
|
|
77
|
+
|
|
78
|
+
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
|
|
79
|
+
match Multipart::from_request(req, state).await {
|
|
80
|
+
Ok(multipart) => Ok(MultipartApi(multipart)),
|
|
81
|
+
Err(rejection) => Err(ApiError {
|
|
82
|
+
status: StatusCode::BAD_REQUEST,
|
|
83
|
+
body: ErrorResponse {
|
|
84
|
+
error_type: "MultipartError".to_string(),
|
|
85
|
+
message: rejection.body_text(),
|
|
86
|
+
traceback: None,
|
|
87
|
+
status_code: StatusCode::BAD_REQUEST.as_u16(),
|
|
88
|
+
},
|
|
89
|
+
}),
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
66
94
|
/// API-specific error wrapper.
|
|
67
95
|
#[derive(Debug)]
|
|
68
96
|
pub struct ApiError {
|