kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
data/sig/kreuzberg.rbs CHANGED
@@ -25,6 +25,8 @@ module Kreuzberg
25
25
  end
26
26
 
27
27
  class Tesseract
28
+ attr_reader options: Hash[Symbol, untyped]
29
+
28
30
  def initialize: (**untyped options) -> void
29
31
  def to_h: () -> Hash[Symbol, untyped]
30
32
  end
@@ -82,13 +84,25 @@ module Kreuzberg
82
84
  def to_h: () -> Hash[Symbol, untyped]
83
85
  end
84
86
 
87
+ class Hierarchy
88
+ attr_reader enabled: bool
89
+ attr_reader k_clusters: Integer
90
+ attr_reader include_bbox: bool
91
+ attr_reader ocr_coverage_threshold: Float?
92
+
93
+ def initialize: (?enabled: bool, ?k_clusters: Integer, ?include_bbox: bool, ?ocr_coverage_threshold: Float?) -> void
94
+ def to_h: () -> Hash[Symbol, untyped]
95
+ def self.from_h: (Hash[Symbol, untyped]?) -> Hierarchy?
96
+ end
97
+
85
98
  class PDF
86
99
  attr_reader extract_images: bool
87
100
  attr_reader passwords: Array[String]?
88
101
  attr_reader extract_metadata: bool
89
102
  attr_reader font_config: FontConfig?
103
+ attr_reader hierarchy: Hierarchy?
90
104
 
91
- def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?) -> void
105
+ def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?) -> void
92
106
  def to_h: () -> Hash[Symbol, untyped]
93
107
  end
94
108
 
@@ -160,19 +174,44 @@ module Kreuzberg
160
174
  end
161
175
 
162
176
  class HtmlOptions
177
+ attr_reader options: Hash[Symbol, untyped]
178
+
163
179
  def initialize: (**untyped options) -> void
164
180
  def to_h: () -> Hash[Symbol, untyped]
165
181
  end
166
182
 
183
+ class KeywordYakeParams
184
+ attr_reader window_size: Integer
185
+
186
+ def initialize: (?window_size: Integer) -> void
187
+ def to_h: () -> Hash[Symbol, untyped]
188
+ end
189
+
190
+ class KeywordRakeParams
191
+ attr_reader min_word_length: Integer
192
+ attr_reader max_words_per_phrase: Integer
193
+
194
+ def initialize: (?min_word_length: Integer, ?max_words_per_phrase: Integer) -> void
195
+ def to_h: () -> Hash[Symbol, untyped]
196
+ end
197
+
167
198
  class Keywords
199
+ attr_reader algorithm: String?
200
+ attr_reader max_keywords: Integer?
201
+ attr_reader min_score: Float?
202
+ attr_reader ngram_range: Array[Integer]?
203
+ attr_reader language: String?
204
+ attr_reader yake_params: KeywordYakeParams?
205
+ attr_reader rake_params: KeywordRakeParams?
206
+
168
207
  def initialize: (
169
- ?algorithm: Symbol?,
208
+ ?algorithm: (Symbol | String)?,
170
209
  ?max_keywords: Integer?,
171
210
  ?min_score: Float?,
172
211
  ?ngram_range: Array[Integer]?,
173
- ?language: Symbol?,
174
- ?yake_params: Hash[Symbol, untyped]?,
175
- ?rake_params: Hash[Symbol, untyped]?
212
+ ?language: (Symbol | String)?,
213
+ ?yake_params: (KeywordYakeParams | Hash[Symbol, untyped])?,
214
+ ?rake_params: (KeywordRakeParams | Hash[Symbol, untyped])?
176
215
  ) -> void
177
216
  def to_h: () -> Hash[Symbol, untyped]
178
217
  end
@@ -194,7 +233,7 @@ module Kreuzberg
194
233
  attr_reader chunking: Chunking?
195
234
  attr_reader language_detection: LanguageDetection?
196
235
  attr_reader pdf_options: PDF?
197
- attr_reader image_extraction: ImageExtraction?
236
+ attr_reader images: ImageExtraction?
198
237
  attr_reader postprocessor: PostProcessor?
199
238
  attr_reader token_reduction: TokenReduction?
200
239
  attr_reader keywords: Keywords?
@@ -204,7 +243,10 @@ module Kreuzberg
204
243
  attr_reader output_format: String?
205
244
  attr_reader result_format: String?
206
245
 
246
+ alias image_extraction images
247
+
207
248
  def self.from_file: (String path) -> Extraction
249
+ def self.discover: () -> Extraction?
208
250
  def initialize: (
209
251
  ?use_cache: bool,
210
252
  ?enable_quality_processing: bool,
@@ -224,10 +266,23 @@ module Kreuzberg
224
266
  ?result_format: String?
225
267
  ) -> void
226
268
  def to_h: () -> Hash[Symbol, untyped]
269
+ def to_json: (*untyped) -> String
270
+ def get_field: (String | Symbol field_name) -> untyped
271
+ def merge: (Extraction | Hash[Symbol, untyped] other) -> Extraction
272
+ def merge!: (Extraction | Hash[Symbol, untyped] other) -> self
273
+ def []: (Symbol | String key) -> untyped
274
+ def []=: (Symbol | String key, untyped value) -> untyped
275
+ def output_format=: (String? value) -> String?
276
+ def result_format=: (String? value) -> String?
227
277
 
228
278
  private
229
279
 
230
280
  def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
281
+ def extract_from_hash: (Hash[Symbol, untyped]? hash, Hash[Symbol, untyped] defaults) -> Hash[Symbol, untyped]
282
+ def assign_attributes: (Hash[Symbol, untyped] params) -> void
283
+ def validate_output_format: (untyped value) -> String?
284
+ def validate_result_format: (untyped value) -> String?
285
+ def update_from_merged: (Extraction merged) -> void
231
286
  end
232
287
 
233
288
  end
@@ -249,14 +304,23 @@ module Kreuzberg
249
304
  content: String,
250
305
  mime_type: String,
251
306
  metadata_json: String,
307
+ metadata: Hash[String, untyped],
252
308
  tables: Array[table_hash]?,
253
309
  detected_languages: Array[String]?,
254
310
  chunks: Array[chunk_hash]?,
255
311
  images: Array[image_hash]?,
312
+ pages: Array[page_content_hash]?,
256
313
  elements: Array[element_hash]?,
257
314
  djot_content: djot_content_hash?
258
315
  }
259
316
 
317
+ type page_content_hash = {
318
+ page_number: Integer,
319
+ content: String,
320
+ tables: Array[table_hash],
321
+ images: Array[image_hash]
322
+ }
323
+
260
324
  type djot_content_hash = {
261
325
  plain_text: String,
262
326
  blocks: Array[formatted_block_hash],
@@ -315,8 +379,8 @@ module Kreuzberg
315
379
  byte_start: Integer,
316
380
  byte_end: Integer,
317
381
  token_count: Integer?,
318
- chunk_index: Integer?,
319
- total_chunks: Integer?,
382
+ chunk_index: Integer,
383
+ total_chunks: Integer,
320
384
  first_page: Integer?,
321
385
  last_page: Integer?,
322
386
  embedding: Array[Float]?
@@ -331,7 +395,7 @@ module Kreuzberg
331
395
  height: Integer?,
332
396
  colorspace: String?,
333
397
  bits_per_component: Integer?,
334
- is_mask: bool?,
398
+ is_mask: bool,
335
399
  description: String?,
336
400
  ocr_result: extraction_result_hash?
337
401
  }
@@ -361,8 +425,8 @@ module Kreuzberg
361
425
  attr_reader byte_start: Integer
362
426
  attr_reader byte_end: Integer
363
427
  attr_reader token_count: Integer?
364
- attr_reader chunk_index: Integer?
365
- attr_reader total_chunks: Integer?
428
+ attr_reader chunk_index: Integer
429
+ attr_reader total_chunks: Integer
366
430
  attr_reader first_page: Integer?
367
431
  attr_reader last_page: Integer?
368
432
  attr_reader embedding: Array[Float]?
@@ -372,8 +436,8 @@ module Kreuzberg
372
436
  byte_start: Integer,
373
437
  byte_end: Integer,
374
438
  token_count: Integer?,
375
- chunk_index: Integer?,
376
- total_chunks: Integer?,
439
+ chunk_index: Integer,
440
+ total_chunks: Integer,
377
441
  first_page: Integer?,
378
442
  last_page: Integer?,
379
443
  embedding: Array[Float]?
@@ -390,7 +454,7 @@ module Kreuzberg
390
454
  attr_reader height: Integer?
391
455
  attr_reader colorspace: String?
392
456
  attr_reader bits_per_component: Integer?
393
- attr_reader is_mask: bool?
457
+ attr_reader is_mask: bool
394
458
  attr_reader description: String?
395
459
  attr_reader ocr_result: Result?
396
460
 
@@ -403,18 +467,83 @@ module Kreuzberg
403
467
  height: Integer?,
404
468
  colorspace: String?,
405
469
  bits_per_component: Integer?,
406
- is_mask: bool?,
470
+ is_mask: bool,
407
471
  description: String?,
408
472
  ocr_result: Result?
409
473
  ) -> void
410
474
  def to_h: () -> image_hash
411
475
  end
412
476
 
477
+ # Page content with text and extracted elements
478
+ class PageContent
479
+ attr_reader page_number: Integer
480
+ attr_reader content: String
481
+ attr_reader tables: Array[Table]
482
+ attr_reader images: Array[Image]?
483
+ attr_reader hierarchy: PageHierarchy?
484
+
485
+ def initialize: (page_number: Integer, content: String, tables: Array[Table], images: Array[Image]?, hierarchy: PageHierarchy?) -> void
486
+ def to_h: () -> Hash[Symbol, untyped]
487
+ end
488
+
489
+ # Hierarchical block element (for page hierarchy)
490
+ class HierarchicalBlock
491
+ attr_reader text: String
492
+ attr_reader font_size: Float?
493
+ attr_reader level: String?
494
+ attr_reader bbox: Array[Float]?
495
+
496
+ def initialize: (text: String, font_size: Float?, level: String?, bbox: Array[Float]?) -> void
497
+ def to_h: () -> Hash[Symbol, untyped]
498
+ end
499
+
500
+ # Page hierarchy information
501
+ class PageHierarchy
502
+ attr_reader block_count: Integer
503
+ attr_reader blocks: Array[HierarchicalBlock]
504
+
505
+ def initialize: (block_count: Integer, blocks: Array[HierarchicalBlock]) -> void
506
+ def to_h: () -> Hash[Symbol, untyped]
507
+ end
508
+
509
+ # Element bounding box coordinates
510
+ class ElementBoundingBox
511
+ attr_reader x0: Float
512
+ attr_reader y0: Float
513
+ attr_reader x1: Float
514
+ attr_reader y1: Float
515
+
516
+ def initialize: (x0: Float, y0: Float, x1: Float, y1: Float) -> void
517
+ def to_h: () -> Hash[Symbol, untyped]
518
+ end
519
+
520
+ # Element metadata
521
+ class ElementMetadataStruct
522
+ attr_reader page_number: Integer?
523
+ attr_reader filename: String?
524
+ attr_reader coordinates: ElementBoundingBox?
525
+ attr_reader element_index: Integer?
526
+ attr_reader additional: Hash[String, String]
527
+
528
+ def initialize: (page_number: Integer?, filename: String?, coordinates: ElementBoundingBox?, element_index: Integer?, additional: Hash[String, String]) -> void
529
+ def to_h: () -> Hash[Symbol, untyped]
530
+ end
531
+
532
+ # Structured document element
533
+ class ElementStruct
534
+ attr_reader element_id: String
535
+ attr_reader element_type: String
536
+ attr_reader text: String
537
+ attr_reader metadata: ElementMetadataStruct
538
+
539
+ def initialize: (element_id: String, element_type: String, text: String, metadata: ElementMetadataStruct) -> void
540
+ def to_h: () -> Hash[Symbol, untyped]
541
+ end
542
+
413
543
  # Structured Djot document representation
414
544
  class DjotContent
415
545
  attr_reader plain_text: String
416
546
  attr_reader blocks: Array[DjotContent::FormattedBlock]
417
- attr_reader metadata: Hash[untyped, untyped]
418
547
  attr_reader metadata_json: String
419
548
  attr_reader tables: Array[untyped]
420
549
  attr_reader images: Array[DjotContent::DjotImage]
@@ -424,6 +553,7 @@ module Kreuzberg
424
553
 
425
554
  def initialize: (untyped hash) -> void
426
555
  def to_h: () -> Hash[Symbol, untyped]
556
+ def metadata: () -> Hash[untyped, untyped]
427
557
 
428
558
  private
429
559
 
@@ -484,64 +614,84 @@ module Kreuzberg
484
614
  attr_reader detected_languages: Array[String]?
485
615
  attr_reader chunks: Array[Chunk]?
486
616
  attr_reader images: Array[Image]?
617
+ attr_reader pages: Array[PageContent]?
618
+ attr_reader elements: Array[ElementStruct]?
487
619
  attr_reader djot_content: DjotContent?
488
620
 
489
621
  def initialize: (extraction_result_hash hash) -> void
490
622
  def to_h: () -> Hash[Symbol, untyped]
491
623
  def to_json: (*untyped) -> String
492
624
 
625
+ def page_count: () -> Integer
626
+ def chunk_count: () -> Integer
627
+ def detected_language: () -> String?
628
+ def metadata_field: (String | Symbol name) -> untyped
629
+
493
630
  private
494
631
 
495
632
  def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
496
633
  def parse_tables: (Array[table_hash]? tables_data) -> Array[Table]
497
634
  def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
498
635
  def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]?
636
+ def parse_images: (Array[image_hash]? images_data) -> Array[Image]?
637
+ def parse_pages: (Array[page_content_hash]? pages_data) -> Array[PageContent]?
638
+ def parse_elements: (Array[untyped]? elements_data) -> Array[ElementStruct]?
639
+ def parse_element: (Hash[String, untyped] element_hash) -> ElementStruct
640
+ def parse_element_coordinates: (Hash[String, untyped]? coordinates_data) -> ElementBoundingBox?
641
+ def parse_page_hierarchy: (Hash[String, untyped]? hierarchy_data) -> PageHierarchy?
642
+ def parse_djot_content: (Hash[String, untyped]? djot_data) -> DjotContent?
643
+ def get_value: (Hash[String | Symbol, untyped] hash, String key, ?untyped default) -> untyped
644
+ def serialize_tables: () -> Array[table_hash]
645
+ def serialize_chunks: () -> Array[chunk_hash]?
646
+ def serialize_images: () -> Array[image_hash]?
647
+ def serialize_pages: () -> Array[Hash[Symbol, untyped]]?
648
+ def serialize_elements: () -> Array[Hash[Symbol, untyped]]?
499
649
  end
500
650
 
501
651
  # Module methods (extraction API)
502
652
  def self.extract_file_sync: (
503
- String | Pathname path,
653
+ path: String | Pathname,
504
654
  ?mime_type: String?,
505
655
  ?config: config_input?
506
656
  ) -> Result
507
657
 
508
658
  def self.extract_bytes_sync: (
509
- String data,
510
- String mime_type,
659
+ data: String,
660
+ mime_type: String,
511
661
  ?config: config_input?
512
662
  ) -> Result
513
663
 
514
664
  def self.batch_extract_files_sync: (
515
- Array[String | Pathname] paths,
665
+ paths: Array[String | Pathname],
516
666
  ?config: config_input?
517
667
  ) -> Array[Result]
518
668
 
519
669
  def self.batch_extract_bytes_sync: (
520
- Array[String] data_array,
521
- Array[String] mime_types,
670
+ data_array: Array[String],
671
+ mime_types: Array[String],
522
672
  ?config: config_input?
523
673
  ) -> Array[Result]
524
674
 
525
675
  def self.extract_file: (
526
- String | Pathname path,
676
+ path: String | Pathname,
527
677
  ?mime_type: String?,
528
678
  ?config: config_input?
529
679
  ) -> Result
530
680
 
531
681
  def self.extract_bytes: (
532
- String data,
533
- String mime_type,
682
+ data: String,
683
+ mime_type: String,
534
684
  ?config: config_input?
535
685
  ) -> Result
536
686
 
537
687
  def self.batch_extract_files: (
538
- Array[String | Pathname] paths,
688
+ paths: Array[String | Pathname],
539
689
  ?config: config_input?
540
690
  ) -> Array[Result]
541
691
 
542
692
  def self.batch_extract_bytes: (
543
- Array[String] data_array,
544
- Array[String] mime_types,
693
+ data_array: Array[String],
694
+ mime_types: Array[String],
545
695
  ?config: config_input?
546
696
  ) -> Array[Result]
547
697
 
@@ -549,17 +699,51 @@ module Kreuzberg
549
699
  def self.clear_cache: () -> void
550
700
  def self.cache_stats: () -> Hash[Symbol | String, Integer]
551
701
 
552
- # Config loading (native method)
702
+ # Config loading (native methods)
553
703
  def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
704
+ def self._config_discover_native: () -> Hash[Symbol, untyped]?
554
705
 
555
706
  # Error introspection (native methods)
556
707
  def self._last_error_code_native: () -> Integer
557
708
  def self._last_panic_context_json_native: () -> String?
558
- def self._get_error_details_native: () -> Hash[Symbol, untyped]
559
- def self._classify_error_native: (String message) -> Integer
709
+ def self._get_error_details_native: () -> Hash[String, untyped]
710
+ def self._classify_error_native: (String message) -> Hash[String, untyped]
560
711
  def self._error_code_name_native: (Integer code) -> String
561
712
  def self._error_code_description_native: (Integer code) -> String
562
713
 
714
+ # MIME type detection
715
+ def self.detect_mime_type: (String data) -> String
716
+ def self.detect_mime_type_from_path: (String path) -> String
717
+ def self.get_extensions_for_mime: (String mime_type) -> Array[String]
718
+ def self.validate_mime_type: (String mime_type) -> String
719
+
720
+ # Validation native methods
721
+ def self._validate_binarization_method_native: (String method) -> Integer
722
+ def self._validate_ocr_backend_native: (String backend) -> Integer
723
+ def self._validate_language_code_native: (String code) -> Integer
724
+ def self._validate_token_reduction_level_native: (String level) -> Integer
725
+ def self._validate_tesseract_psm_native: (Integer psm) -> Integer
726
+ def self._validate_tesseract_oem_native: (Integer oem) -> Integer
727
+ def self._validate_output_format_native: (String format) -> Integer
728
+ def self._validate_confidence_native: (Float confidence) -> Integer
729
+ def self._validate_dpi_native: (Integer dpi) -> Integer
730
+ def self._validate_chunking_params_native: (Integer max_chars, Integer max_overlap) -> Integer
731
+ def self._get_valid_binarization_methods_native: () -> String
732
+ def self._get_valid_language_codes_native: () -> String
733
+ def self._get_valid_ocr_backends_native: () -> String
734
+ def self._get_valid_token_reduction_levels_native: () -> String
735
+
736
+ # Config wrapper functions
737
+ def self._config_to_json_native: (String config_json) -> String
738
+ def self._config_get_field_native: (String config_json, String field_name) -> untyped
739
+ def self._config_merge_native: (String base_json, String override_json) -> String
740
+
741
+ # Result wrapper functions
742
+ def self._result_page_count_native: (untyped result) -> Integer
743
+ def self._result_chunk_count_native: (untyped result) -> Integer
744
+ def self._result_detected_language_native: (untyped result) -> String?
745
+ def self._result_metadata_field_native: (untyped result, String field_name) -> untyped
746
+
563
747
  # Plugin registration
564
748
  def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
565
749
  def self.unregister_post_processor: (String name) -> void
@@ -567,7 +751,15 @@ module Kreuzberg
567
751
  def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void
568
752
  def self.unregister_validator: (String name) -> void
569
753
  def self.clear_validators: () -> void
570
- def self.register_ocr_backend: (_OcrBackend backend) -> void
754
+ def self.register_ocr_backend: (String name, _OcrBackend backend) -> void
755
+ def self.unregister_ocr_backend: (String name) -> void
756
+ def self.list_ocr_backends: () -> Array[String]
757
+ def self.clear_ocr_backends: () -> void
758
+ def self.unregister_document_extractor: (String name) -> void
759
+ def self.list_document_extractors: () -> Array[String]
760
+ def self.clear_document_extractors: () -> void
761
+ def self.list_post_processors: () -> Array[String]
762
+ def self.list_validators: () -> Array[String]
571
763
 
572
764
  interface _PostProcessor
573
765
  def call: (extraction_result_hash result) -> extraction_result_hash
@@ -579,15 +771,15 @@ module Kreuzberg
579
771
 
580
772
  interface _OcrBackend
581
773
  def name: () -> String
582
- def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
774
+ def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String
583
775
  end
584
776
 
585
777
  module ErrorContext
586
778
  def self.last_error_code: () -> Integer
587
779
  def self.last_panic_context: () -> Errors::PanicContext?
588
780
  def self.last_panic_context_json: () -> String?
589
- def self.error_details: () -> Hash[Symbol, untyped]
590
- def self.classify_error: (String message) -> Integer
781
+ def self.error_details: () -> Hash[String, untyped]
782
+ def self.classify_error: (String message) -> (Hash[String, untyped] | Integer)
591
783
  def self.error_code_name: (Integer code) -> String
592
784
  def self.error_code_description: (Integer code) -> String
593
785
  end
@@ -673,7 +865,6 @@ module Kreuzberg
673
865
 
674
866
  module OcrBackendProtocol
675
867
  def name: () -> String
676
- def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
677
- def process_image: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
868
+ def process_image: (String image_bytes, Hash[Symbol, untyped] config) -> String
678
869
  end
679
870
  end
data/spec/binding/batch_operations_spec.rb CHANGED
@@ -439,12 +439,14 @@ RSpec.describe 'Batch Operations' do
439
439
 
440
440
  it 'batch results have consistent structure' do
441
441
  paths = []
442
+ tempfiles = []
442
443
 
443
444
  3.times do |i|
444
445
  file = Tempfile.new(["struct_#{i}", '.txt'])
445
446
  file.write("Structure test #{i}")
446
447
  file.close
447
448
  paths << file.path
449
+ tempfiles << file
448
450
  end
449
451
 
450
452
  config = Kreuzberg::Config::Extraction.new
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.6"
6
+ version = "4.2.7"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -43,11 +43,12 @@ tracing = "0.1"
43
43
  # Utilities
44
44
  ahash = "0.8.12"
45
45
  base64 = "0.22.1"
46
+ bytes = { version = "1", features = ["serde"] }
46
47
  hex = "0.4.3"
47
48
  toml = "0.9.11"
48
49
  num_cpus = "1.17.0"
49
50
  once_cell = "1.21.3"
50
- html-to-markdown-rs = { version = "2.24.1", default-features = false }
51
+ html-to-markdown-rs = { version = "2.24.4", default-features = false }
51
52
  reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
52
53
  image = { version = "0.25.9", default-features = false }
53
54
  lzma-rust2 = { version = "0.15.7" }
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.6"
3
+ version = "4.2.7"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -120,6 +120,7 @@ async-trait = { workspace = true }
120
120
  base64 = { workspace = true }
121
121
  base64-simd = "0.8"
122
122
  bitvec = "1.0"
123
+ bytes = { workspace = true }
123
124
  dashmap = "6.1"
124
125
  dirs = "6.0"
125
126
  simdutf8 = { version = "0.1", optional = true }
data/vendor/kreuzberg/README.md CHANGED
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.6 Release**
20
+ > **🚀 Version 4.2.7 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
data/vendor/kreuzberg/src/api/error.rs CHANGED
@@ -3,7 +3,7 @@
3
3
  use axum::{
4
4
  Json,
5
5
  body::to_bytes,
6
- extract::{FromRequest, Request, rejection::JsonRejection},
6
+ extract::{FromRequest, Multipart, Request, rejection::JsonRejection},
7
7
  http::StatusCode,
8
8
  response::{IntoResponse, Response},
9
9
  };
@@ -63,6 +63,34 @@ where
63
63
  }
64
64
  }
65
65
 
66
+ /// Custom Multipart extractor that returns JSON error responses instead of plain text.
67
+ ///
68
+ /// This wraps axum's `Multipart` extractor but uses `ApiError` as the rejection type,
69
+ /// ensuring that multipart parsing errors are returned as JSON with proper content type.
70
+ pub struct MultipartApi(pub Multipart);
71
+
72
+ impl<S> FromRequest<S> for MultipartApi
73
+ where
74
+ S: Send + Sync,
75
+ {
76
+ type Rejection = ApiError;
77
+
78
+ async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
79
+ match Multipart::from_request(req, state).await {
80
+ Ok(multipart) => Ok(MultipartApi(multipart)),
81
+ Err(rejection) => Err(ApiError {
82
+ status: StatusCode::BAD_REQUEST,
83
+ body: ErrorResponse {
84
+ error_type: "MultipartError".to_string(),
85
+ message: rejection.body_text(),
86
+ traceback: None,
87
+ status_code: StatusCode::BAD_REQUEST.as_u16(),
88
+ },
89
+ }),
90
+ }
91
+ }
92
+ }
93
+
66
94
  /// API-specific error wrapper.
67
95
  #[derive(Debug)]
68
96
  pub struct ApiError {