kreuzberg 4.3.4 → 4.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/ext/kreuzberg_rb/native/src/lib.rs +5 -5
  6. data/ext/kreuzberg_rb/native/src/result.rs +40 -0
  7. data/lib/kreuzberg/result.rb +44 -20
  8. data/lib/kreuzberg/version.rb +1 -1
  9. data/lib/kreuzberg.rb +9 -0
  10. data/sig/kreuzberg.rbs +7 -2
  11. data/vendor/Cargo.toml +3 -3
  12. data/vendor/kreuzberg/Cargo.toml +5 -5
  13. data/vendor/kreuzberg/README.md +1 -1
  14. data/vendor/kreuzberg/src/api/router.rs +2 -2
  15. data/vendor/kreuzberg/src/chunking/core.rs +2 -0
  16. data/vendor/kreuzberg/src/chunking/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/core/pipeline/format.rs +17 -6
  18. data/vendor/kreuzberg/src/core/pipeline/tests.rs +30 -3
  19. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -0
  20. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +1 -0
  21. data/vendor/kreuzberg/src/extraction/transform/mod.rs +2 -0
  22. data/vendor/kreuzberg/src/extractors/csv.rs +1 -0
  23. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -0
  24. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -0
  25. data/vendor/kreuzberg/src/extractors/docx.rs +2 -0
  26. data/vendor/kreuzberg/src/extractors/excel.rs +1 -0
  27. data/vendor/kreuzberg/src/extractors/html.rs +15 -8
  28. data/vendor/kreuzberg/src/extractors/jats/elements.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/latex/environments.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/markdown.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/odt.rs +1 -0
  33. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  34. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +185 -10
  35. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +84 -12
  36. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +1 -0
  38. data/vendor/kreuzberg/src/mcp/format.rs +2 -1
  39. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  40. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +2 -1
  41. data/vendor/kreuzberg/src/paddle_ocr/config.rs +26 -6
  42. data/vendor/kreuzberg/src/pdf/fonts.rs +3 -1
  43. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +78 -9
  44. data/vendor/kreuzberg/src/pdf/hierarchy/extraction.rs +57 -7
  45. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +1 -1
  46. data/vendor/kreuzberg/src/pdf/markdown.rs +2014 -0
  47. data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
  48. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +2 -0
  49. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +6 -0
  50. data/vendor/kreuzberg/src/plugins/mod.rs +6 -0
  51. data/vendor/kreuzberg/src/plugins/ocr.rs +6 -0
  52. data/vendor/kreuzberg/src/plugins/processor/mod.rs +1 -0
  53. data/vendor/kreuzberg/src/plugins/validator/mod.rs +1 -0
  54. data/vendor/kreuzberg/src/types/extraction.rs +6 -0
  55. data/vendor/kreuzberg/src/types/mod.rs +167 -0
  56. data/vendor/kreuzberg/src/types/tables.rs +121 -0
  57. data/vendor/kreuzberg/tests/config_behavioral.rs +1 -0
  58. data/vendor/kreuzberg/tests/dump_pdf_markdown.rs +83 -0
  59. data/vendor/kreuzberg/tests/helpers/mod.rs +3 -0
  60. data/vendor/kreuzberg/tests/pdf_markdown_all_docs.rs +282 -0
  61. data/vendor/kreuzberg/tests/pdf_markdown_extraction.rs +108 -0
  62. data/vendor/kreuzberg/tests/pdf_table_detection.rs +285 -0
  63. data/vendor/kreuzberg/tests/pdf_text_merging.rs +9 -0
  64. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +4 -0
  65. data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
  66. data/vendor/kreuzberg-ffi/src/helpers.rs +1 -0
  67. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  68. data/vendor/kreuzberg-paddle-ocr/src/crnn_net.rs +58 -0
  69. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  70. data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text/char.rs +15 -0
  71. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  72. metadata +7 -4
  73. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  74. data/lib/libpdfium.so +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f3c080ecb465d2860ccaef34aff627bf155689e912b0ff48ae72d99a16f045cf
4
- data.tar.gz: 8cd3f4aae7a19c229bfb2451811233f59e04c39a9aa9d6d9d69f9e12d747ec04
3
+ metadata.gz: 3936788c6812a84428d0467330f573c20c9b569c399eab105cc2815d777b2141
4
+ data.tar.gz: 1b115f87bc4a40960584de9459725d0afabfefe6244ddea434461a2c6f36a647
5
5
  SHA512:
6
- metadata.gz: c1948824a088e4d6296bd8f161bd59624a252763b47ef21b2b0763d8be0a472de814f95273e03fd82b6c1f8c66aea2fde8fd70c3315ad296be1eaaf09cf3e64c
7
- data.tar.gz: 46340a09386141c8ddc19c4c23f9a3b9e4b74cfa8070d3aeec6ae0e339aec114cf3d83a5d08ef1d9734f293186829b075f1cb79f619fd59133148f9571fa9a4a
6
+ metadata.gz: 73fb7522dcd091b449d5146e65f008bded545a6c49a35d77468bd9bfa61d30a7413dc7364eef6ce79cca4606c5c710ea3f574509743569a7fb4f8a7bc579f402
7
+ data.tar.gz: e21fb401768da5005a1edb720b0c00c82aaa9a8ef60b6f2bab3587b3c8c94cd8fdcc220daeb8545bae6523bf8be22dbc1dab7c966ae8f30e888342d96c7e5df2
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.3.4)
4
+ kreuzberg (4.3.5)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -59,7 +59,7 @@ GEM
59
59
  prism (~> 1.5)
60
60
  mutex_m (0.3.0)
61
61
  parallel (1.27.0)
62
- parser (3.3.10.1)
62
+ parser (3.3.10.2)
63
63
  ast (~> 2.4.1)
64
64
  racc
65
65
  prism (1.9.0)
@@ -210,7 +210,7 @@ CHECKSUMS
210
210
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
211
211
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
212
212
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
213
- kreuzberg (4.3.4)
213
+ kreuzberg (4.3.5)
214
214
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
215
215
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
216
216
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -219,7 +219,7 @@ CHECKSUMS
219
219
  minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
220
220
  mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
221
221
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
222
- parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
222
+ parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
223
223
  prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
224
224
  pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
225
225
  pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.5" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.3.4"
40
+ version = "4.3.5"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -100,19 +100,19 @@ pub fn ruby_cache_stats() -> Result<RHash, Error> {
100
100
 
101
101
  // Validation wrapper functions
102
102
  pub fn validate_binarization_method(method: String) -> Result<i32, Error> {
103
- unsafe { Ok(kreuzberg_validate_binarization_method(method.as_ptr() as *const i8)) }
103
+ unsafe { Ok(kreuzberg_validate_binarization_method(method.as_ptr() as *const std::os::raw::c_char)) }
104
104
  }
105
105
 
106
106
  pub fn validate_ocr_backend(backend: String) -> Result<i32, Error> {
107
- unsafe { Ok(kreuzberg_validate_ocr_backend(backend.as_ptr() as *const i8)) }
107
+ unsafe { Ok(kreuzberg_validate_ocr_backend(backend.as_ptr() as *const std::os::raw::c_char)) }
108
108
  }
109
109
 
110
110
  pub fn validate_language_code(code: String) -> Result<i32, Error> {
111
- unsafe { Ok(kreuzberg_validate_language_code(code.as_ptr() as *const i8)) }
111
+ unsafe { Ok(kreuzberg_validate_language_code(code.as_ptr() as *const std::os::raw::c_char)) }
112
112
  }
113
113
 
114
114
  pub fn validate_token_reduction_level(level: String) -> Result<i32, Error> {
115
- unsafe { Ok(kreuzberg_validate_token_reduction_level(level.as_ptr() as *const i8)) }
115
+ unsafe { Ok(kreuzberg_validate_token_reduction_level(level.as_ptr() as *const std::os::raw::c_char)) }
116
116
  }
117
117
 
118
118
  pub fn validate_tesseract_psm(psm: i32) -> Result<i32, Error> {
@@ -124,7 +124,7 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<i32, Error> {
124
124
  }
125
125
 
126
126
  pub fn validate_output_format(format: String) -> Result<i32, Error> {
127
- unsafe { Ok(kreuzberg_validate_output_format(format.as_ptr() as *const i8)) }
127
+ unsafe { Ok(kreuzberg_validate_output_format(format.as_ptr() as *const std::os::raw::c_char)) }
128
128
  }
129
129
 
130
130
  pub fn validate_confidence(confidence: f64) -> Result<i32, Error> {
@@ -53,6 +53,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
53
53
  table_hash.aset("cells", cells_array)?;
54
54
  table_hash.aset("markdown", table.markdown)?;
55
55
  table_hash.aset("page_number", table.page_number)?;
56
+ if let Some(bbox) = table.bounding_box {
57
+ let bbox_hash = ruby.hash_new();
58
+ bbox_hash.aset("x0", bbox.x0)?;
59
+ bbox_hash.aset("y0", bbox.y0)?;
60
+ bbox_hash.aset("x1", bbox.x1)?;
61
+ bbox_hash.aset("y1", bbox.y1)?;
62
+ table_hash.aset("bounding_box", bbox_hash)?;
63
+ } else {
64
+ table_hash.aset("bounding_box", ruby.qnil().as_value())?;
65
+ }
56
66
 
57
67
  tables_array.push(table_hash)?;
58
68
  }
@@ -164,6 +174,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
164
174
  } else {
165
175
  image_hash.aset("ocr_result", ruby.qnil().as_value())?;
166
176
  }
177
+ if let Some(bbox) = image.bounding_box {
178
+ let bbox_hash = ruby.hash_new();
179
+ bbox_hash.aset("x0", bbox.x0)?;
180
+ bbox_hash.aset("y0", bbox.y0)?;
181
+ bbox_hash.aset("x1", bbox.x1)?;
182
+ bbox_hash.aset("y1", bbox.y1)?;
183
+ image_hash.aset("bounding_box", bbox_hash)?;
184
+ } else {
185
+ image_hash.aset("bounding_box", ruby.qnil().as_value())?;
186
+ }
167
187
  images_array.push(image_hash)?;
168
188
  }
169
189
  set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
@@ -191,6 +211,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
191
211
  table_hash.aset("cells", cells_array)?;
192
212
  table_hash.aset("markdown", table.markdown.clone())?;
193
213
  table_hash.aset("page_number", table.page_number as i64)?;
214
+ if let Some(ref bbox) = table.bounding_box {
215
+ let bbox_hash = ruby.hash_new();
216
+ bbox_hash.aset("x0", bbox.x0)?;
217
+ bbox_hash.aset("y0", bbox.y0)?;
218
+ bbox_hash.aset("x1", bbox.x1)?;
219
+ bbox_hash.aset("y1", bbox.y1)?;
220
+ table_hash.aset("bounding_box", bbox_hash)?;
221
+ } else {
222
+ table_hash.aset("bounding_box", ruby.qnil().as_value())?;
223
+ }
194
224
 
195
225
  tables_array.push(table_hash)?;
196
226
  }
@@ -248,6 +278,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
248
278
  } else {
249
279
  image_hash.aset("ocr_result", ruby.qnil().as_value())?;
250
280
  }
281
+ if let Some(ref bbox) = image.bounding_box {
282
+ let bbox_hash = ruby.hash_new();
283
+ bbox_hash.aset("x0", bbox.x0)?;
284
+ bbox_hash.aset("y0", bbox.y0)?;
285
+ bbox_hash.aset("x1", bbox.x1)?;
286
+ bbox_hash.aset("y1", bbox.y1)?;
287
+ image_hash.aset("bounding_box", bbox_hash)?;
288
+ } else {
289
+ image_hash.aset("bounding_box", ruby.qnil().as_value())?;
290
+ }
251
291
  images_array.push(image_hash)?;
252
292
  }
253
293
  page_hash.aset("images", images_array)?;
@@ -22,9 +22,11 @@ module Kreuzberg
22
22
  # @return [String] Markdown representation
23
23
  # @!attribute [r] page_number
24
24
  # @return [Integer] Page number where table was found
25
- Table = Struct.new(:cells, :markdown, :page_number, keyword_init: true) do
25
+ # @!attribute [r] bounding_box
26
+ # @return [BoundingBox, nil] Bounding box of the table on the page
27
+ Table = Struct.new(:cells, :markdown, :page_number, :bounding_box, keyword_init: true) do
26
28
  def to_h
27
- { cells: cells, markdown: markdown, page_number: page_number }
29
+ { cells: cells, markdown: markdown, page_number: page_number, bounding_box: bounding_box&.to_h }
28
30
  end
29
31
  end
30
32
 
@@ -78,6 +80,7 @@ module Kreuzberg
78
80
  :bits_per_component,
79
81
  :is_mask,
80
82
  :description,
83
+ :bounding_box,
81
84
  :ocr_result,
82
85
  keyword_init: true
83
86
  ) do
@@ -93,6 +96,7 @@ module Kreuzberg
93
96
  bits_per_component: bits_per_component,
94
97
  is_mask: is_mask,
95
98
  description: description,
99
+ bounding_box: bounding_box&.to_h,
96
100
  ocr_result: ocr_result&.to_h
97
101
  }
98
102
  end
@@ -486,10 +490,12 @@ module Kreuzberg
486
490
  return [] if tables_data.nil? || tables_data.empty?
487
491
 
488
492
  tables_data.map do |table_hash|
493
+ bounding_box = parse_bounding_box(table_hash['bounding_box'])
489
494
  Table.new(
490
495
  cells: table_hash['cells'] || [],
491
496
  markdown: table_hash['markdown'] || '',
492
- page_number: table_hash['page_number'] || 0
497
+ page_number: table_hash['page_number'] || 0,
498
+ bounding_box: bounding_box
493
499
  )
494
500
  end
495
501
  end
@@ -521,23 +527,26 @@ module Kreuzberg
521
527
  def parse_images(images_data)
522
528
  return nil if images_data.nil?
523
529
 
524
- images_data.map do |image_hash|
525
- data = image_hash['data']
526
- data = data.dup.force_encoding(Encoding::BINARY) if data.respond_to?(:force_encoding)
527
- Image.new(
528
- data: data,
529
- format: image_hash['format'],
530
- image_index: image_hash['image_index'],
531
- page_number: image_hash['page_number'],
532
- width: image_hash['width'],
533
- height: image_hash['height'],
534
- colorspace: image_hash['colorspace'],
535
- bits_per_component: image_hash['bits_per_component'],
536
- is_mask: image_hash['is_mask'],
537
- description: image_hash['description'],
538
- ocr_result: image_hash['ocr_result'] ? Result.new(image_hash['ocr_result']) : nil
539
- )
540
- end
530
+ images_data.map { |image_hash| parse_single_image(image_hash) }
531
+ end
532
+
533
+ def parse_single_image(image_hash)
534
+ data = image_hash['data']
535
+ data = data.dup.force_encoding(Encoding::BINARY) if data.respond_to?(:force_encoding)
536
+ Image.new(
537
+ data: data,
538
+ format: image_hash['format'],
539
+ image_index: image_hash['image_index'],
540
+ page_number: image_hash['page_number'],
541
+ width: image_hash['width'],
542
+ height: image_hash['height'],
543
+ colorspace: image_hash['colorspace'],
544
+ bits_per_component: image_hash['bits_per_component'],
545
+ is_mask: image_hash['is_mask'],
546
+ description: image_hash['description'],
547
+ bounding_box: parse_bounding_box(image_hash['bounding_box']),
548
+ ocr_result: image_hash['ocr_result'] ? Result.new(image_hash['ocr_result']) : nil
549
+ )
541
550
  end
542
551
 
543
552
  def parse_pages(pages_data)
@@ -610,6 +619,21 @@ module Kreuzberg
610
619
  )
611
620
  end
612
621
 
622
+ def parse_bounding_box(bounding_box_data)
623
+ return nil if bounding_box_data.nil?
624
+
625
+ # If it's already a BoundingBox object, return it
626
+ return bounding_box_data if bounding_box_data.is_a?(BoundingBox)
627
+
628
+ # Otherwise parse from hash
629
+ BoundingBox.new(
630
+ x0: bounding_box_data['x0'].to_f,
631
+ y0: bounding_box_data['y0'].to_f,
632
+ x1: bounding_box_data['x1'].to_f,
633
+ y1: bounding_box_data['y1'].to_f
634
+ )
635
+ end
636
+
613
637
  def parse_ocr_elements(ocr_elements_data)
614
638
  return nil if ocr_elements_data.nil?
615
639
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.3.4'
4
+ VERSION = '4.3.5'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -21,11 +21,20 @@ module Kreuzberg
21
21
  autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
22
22
  autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
23
23
 
24
+ autoload :BoundingBox, 'kreuzberg/types'
25
+ autoload :ElementMetadata, 'kreuzberg/types'
26
+ autoload :Element, 'kreuzberg/types'
24
27
  autoload :HtmlMetadata, 'kreuzberg/types'
25
28
  autoload :HeaderMetadata, 'kreuzberg/types'
26
29
  autoload :LinkMetadata, 'kreuzberg/types'
27
30
  autoload :ImageMetadata, 'kreuzberg/types'
28
31
  autoload :StructuredData, 'kreuzberg/types'
32
+ autoload :ExtractedKeyword, 'kreuzberg/types'
33
+ autoload :ProcessingWarning, 'kreuzberg/types'
34
+ autoload :DocumentBoundingBox, 'kreuzberg/types'
35
+ autoload :DocumentAnnotation, 'kreuzberg/types'
36
+ autoload :DocumentNode, 'kreuzberg/types'
37
+ autoload :DocumentStructure, 'kreuzberg/types'
29
38
 
30
39
  ExtractionConfig = Config::Extraction
31
40
  PageConfig = Config::PageConfig
data/sig/kreuzberg.rbs CHANGED
@@ -677,7 +677,8 @@ module Kreuzberg
677
677
  type table_hash = {
678
678
  cells: Array[Array[String]],
679
679
  markdown: String,
680
- page_number: Integer
680
+ page_number: Integer,
681
+ bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
681
682
  }
682
683
 
683
684
  type chunk_hash = {
@@ -703,6 +704,7 @@ module Kreuzberg
703
704
  bits_per_component: Integer?,
704
705
  is_mask: bool,
705
706
  description: String?,
707
+ bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?,
706
708
  ocr_result: extraction_result_hash?
707
709
  }
708
710
 
@@ -746,8 +748,9 @@ module Kreuzberg
746
748
  attr_reader cells: Array[Array[String]]
747
749
  attr_reader markdown: String
748
750
  attr_reader page_number: Integer
751
+ attr_reader bounding_box: BoundingBox?
749
752
 
750
- def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer) -> void
753
+ def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer, bounding_box: BoundingBox?) -> void
751
754
  def to_h: () -> table_hash
752
755
  end
753
756
 
@@ -789,6 +792,7 @@ module Kreuzberg
789
792
  attr_reader bits_per_component: Integer?
790
793
  attr_reader is_mask: bool
791
794
  attr_reader description: String?
795
+ attr_reader bounding_box: BoundingBox?
792
796
  attr_reader ocr_result: Result?
793
797
 
794
798
  def initialize: (
@@ -802,6 +806,7 @@ module Kreuzberg
802
806
  bits_per_component: Integer?,
803
807
  is_mask: bool,
804
808
  description: String?,
809
+ bounding_box: BoundingBox?,
805
810
  ocr_result: Result?
806
811
  ) -> void
807
812
  def to_h: () -> image_hash
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.3.4"
5
+ version = "4.3.5"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -21,7 +21,7 @@ console_error_panic_hook = "0.1"
21
21
  criterion = { version = "0.8", features = ["html_reports"] }
22
22
  getrandom = { version = "0.4.1", features = ["wasm_js"] }
23
23
  hex = "0.4.3"
24
- html-to-markdown-rs = { version = "2.25.0", default-features = false }
24
+ html-to-markdown-rs = { version = "2.25.1", default-features = false }
25
25
  image = { version = "0.25.9", default-features = false }
26
26
  js-sys = "0.3"
27
27
  libc = "0.2.182"
@@ -37,7 +37,7 @@ serde_json = { version = "1.0.149" }
37
37
  tempfile = "3.25.0"
38
38
  thiserror = "2.0.18"
39
39
  tokio = { version = "1.49.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
40
- toml = "1.0.1"
40
+ toml = "1.0.2"
41
41
  tracing = "0.1"
42
42
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
43
43
  wasm-bindgen-futures = "0.4"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.3.4"
3
+ version = "4.3.5"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -154,7 +154,7 @@ serde = { version = "1.0.228", features = ["derive"] }
154
154
  serde_json = { version = "1.0.149" }
155
155
  serde_yaml_ng = "0.10.0"
156
156
  jotdown = "0.9"
157
- toml = "1.0.1"
157
+ toml = "1.0.2"
158
158
  mime_guess = "2.0"
159
159
  rmp-serde = "1.3"
160
160
  thiserror = "2.0.18"
@@ -167,11 +167,11 @@ lopdf = { version = "0.39.0", optional = true }
167
167
  calamine = { version = "0.33.0", features = ["dates"], optional = true }
168
168
  polars = { version = "0.53.0", default-features = false, features = ["ipc"], optional = true }
169
169
  roxmltree = { version = "0.21.1", optional = true }
170
- zip = { version = "8.0.0", optional = true, default-features = false, features = [
170
+ zip = { version = "8.1.0", optional = true, default-features = false, features = [
171
171
  "deflate-flate2",
172
172
  ] }
173
173
  mail-parser = { version = "0.11.2", optional = true }
174
- html-to-markdown-rs = { version = "2.25.0", default-features = false , features = [
174
+ html-to-markdown-rs = { version = "2.25.1", default-features = false , features = [
175
175
  "inline-images", "metadata", ], optional = true }
176
176
  cfb = { version = "0.14.0", optional = true }
177
177
  quick-xml = { version = "0.39.1", features = ["serialize"], optional = true }
@@ -236,7 +236,7 @@ sha2 = { version = "0.10", optional = true }
236
236
  tempfile = "3.25.0"
237
237
  filetime = "0.2"
238
238
  tar = "0.4.44"
239
- zip = { version = "8.0.0", default-features = false, features = ["deflate-flate2"] }
239
+ zip = { version = "8.1.0", default-features = false, features = ["deflate-flate2"] }
240
240
  serial_test = "3.3.1"
241
241
  anyhow = "1.0"
242
242
  tokio-test = "0.4"
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.3.4 Release**
20
+ > **🚀 Version 4.3.5 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -99,7 +99,7 @@ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits
99
99
  /// # Examples
100
100
  ///
101
101
  /// ```no_run
102
- /// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
102
+ /// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits_and_server_config, ApiSizeLimits}, core::ServerConfig};
103
103
  ///
104
104
  /// # #[tokio::main]
105
105
  /// # async fn main() -> kreuzberg::Result<()> {
@@ -108,7 +108,7 @@ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits
108
108
  /// server_config.cors_origins = vec!["https://example.com".to_string()];
109
109
  /// let router = create_router_with_limits_and_server_config(
110
110
  /// extraction_config,
111
- /// Default::default(),
111
+ /// ApiSizeLimits::default(),
112
112
  /// server_config
113
113
  /// );
114
114
  /// # Ok(())
@@ -37,6 +37,8 @@ use super::validation::validate_utf8_boundaries;
37
37
  /// overlap: 50,
38
38
  /// trim: true,
39
39
  /// chunker_type: ChunkerType::Text,
40
+ /// embedding: None,
41
+ /// preset: None,
40
42
  /// };
41
43
  /// let result = chunk_text("Long text...", &config, None)?;
42
44
  /// assert!(!result.chunks.is_empty());
@@ -27,6 +27,8 @@
27
27
  //! overlap: 50,
28
28
  //! trim: true,
29
29
  //! chunker_type: ChunkerType::Text,
30
+ //! embedding: None,
31
+ //! preset: None,
30
32
  //! };
31
33
  //!
32
34
  //! let long_text = "This is a very long document...".repeat(100);
@@ -23,10 +23,12 @@ use std::borrow::Cow;
23
23
  /// * `result` - The extraction result to modify
24
24
  /// * `output_format` - The desired output format
25
25
  pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputFormat) {
26
- // Check if content was already formatted during extraction
27
- let already_formatted = match &*result.mime_type {
28
- "text/markdown" if output_format == OutputFormat::Markdown => true,
29
- "text/djot" if output_format == OutputFormat::Djot => true,
26
+ // Check if content was already formatted during extraction.
27
+ // Since extractors now preserve original MIME types, detect by checking
28
+ // metadata.output_format which is set by extractors that pre-format.
29
+ let already_formatted = match result.metadata.output_format.as_deref() {
30
+ Some("markdown") if output_format == OutputFormat::Markdown => true,
31
+ Some("djot") if output_format == OutputFormat::Djot => true,
30
32
  _ => false,
31
33
  };
32
34
 
@@ -195,7 +197,11 @@ mod tests {
195
197
 
196
198
  let mut result = ExtractionResult {
197
199
  content: "Hello World".to_string(),
198
- mime_type: Cow::Borrowed("text/djot"),
200
+ mime_type: Cow::Borrowed("text/html"),
201
+ metadata: Metadata {
202
+ output_format: Some("djot".to_string()),
203
+ ..Default::default()
204
+ },
199
205
  djot_content: Some(DjotContent {
200
206
  plain_text: "Hello World".to_string(),
201
207
  blocks: vec![FormattedBlock {
@@ -322,6 +328,7 @@ mod tests {
322
328
  cells: vec![vec!["A".to_string(), "B".to_string()]],
323
329
  markdown: "| A | B |".to_string(),
324
330
  page_number: 1,
331
+ bounding_box: None,
325
332
  };
326
333
 
327
334
  let mut result = ExtractionResult {
@@ -367,7 +374,11 @@ mod tests {
367
374
 
368
375
  let mut result = ExtractionResult {
369
376
  content: "test".to_string(),
370
- mime_type: Cow::Borrowed("text/djot"),
377
+ mime_type: Cow::Borrowed("text/html"),
378
+ metadata: Metadata {
379
+ output_format: Some("djot".to_string()),
380
+ ..Default::default()
381
+ },
371
382
  djot_content: Some(djot_content),
372
383
  ..Default::default()
373
384
  };
@@ -40,7 +40,13 @@ async fn test_run_pipeline_basic() {
40
40
  Cow::Borrowed(VALIDATION_MARKER_KEY),
41
41
  serde_json::json!(ORDER_VALIDATION_MARKER),
42
42
  );
43
- let config = ExtractionConfig::default();
43
+ let config = ExtractionConfig {
44
+ postprocessor: Some(crate::core::config::PostProcessorConfig {
45
+ enabled: false,
46
+ ..Default::default()
47
+ }),
48
+ ..Default::default()
49
+ };
44
50
 
45
51
  let processed = run_pipeline(result, &config).await.unwrap();
46
52
  assert_eq!(processed.content, "test");
@@ -98,6 +104,10 @@ async fn test_pipeline_without_quality_processing() {
98
104
  };
99
105
  let config = ExtractionConfig {
100
106
  enable_quality_processing: false,
107
+ postprocessor: Some(crate::core::config::PostProcessorConfig {
108
+ enabled: false,
109
+ ..Default::default()
110
+ }),
101
111
  ..Default::default()
102
112
  };
103
113
 
@@ -166,6 +176,10 @@ async fn test_pipeline_without_chunking() {
166
176
  };
167
177
  let config = ExtractionConfig {
168
178
  chunking: None,
179
+ postprocessor: Some(crate::core::config::PostProcessorConfig {
180
+ enabled: false,
181
+ ..Default::default()
182
+ }),
169
183
  ..Default::default()
170
184
  };
171
185
 
@@ -201,7 +215,13 @@ async fn test_pipeline_preserves_metadata() {
201
215
  quality_score: None,
202
216
  processing_warnings: Vec::new(),
203
217
  };
204
- let config = ExtractionConfig::default();
218
+ let config = ExtractionConfig {
219
+ postprocessor: Some(crate::core::config::PostProcessorConfig {
220
+ enabled: false,
221
+ ..Default::default()
222
+ }),
223
+ ..Default::default()
224
+ };
205
225
 
206
226
  let processed = run_pipeline(result, &config).await.unwrap();
207
227
  assert_eq!(
@@ -222,6 +242,7 @@ async fn test_pipeline_preserves_tables() {
222
242
  cells: vec![vec!["A".to_string(), "B".to_string()]],
223
243
  markdown: "| A | B |".to_string(),
224
244
  page_number: 0,
245
+ bounding_box: None,
225
246
  };
226
247
 
227
248
  let result = ExtractionResult {
@@ -242,7 +263,13 @@ async fn test_pipeline_preserves_tables() {
242
263
  quality_score: None,
243
264
  processing_warnings: Vec::new(),
244
265
  };
245
- let config = ExtractionConfig::default();
266
+ let config = ExtractionConfig {
267
+ postprocessor: Some(crate::core::config::PostProcessorConfig {
268
+ enabled: false,
269
+ ..Default::default()
270
+ }),
271
+ ..Default::default()
272
+ };
246
273
 
247
274
  let processed = run_pipeline(result, &config).await.unwrap();
248
275
  assert_eq!(processed.tables.len(), 1);
@@ -157,6 +157,7 @@ fn extract_pptx_from_container<R: std::io::Read + std::io::Seek>(
157
157
  is_mask: false,
158
158
  description: None,
159
159
  ocr_result: None,
160
+ bounding_box: None,
160
161
  });
161
162
  }
162
163
  }
@@ -694,6 +694,7 @@ mod tests {
694
694
  ],
695
695
  markdown: "| Name | Age |\n|---|---|\n| Alice | 30 |".to_string(),
696
696
  page_number: 1,
697
+ bounding_box: None,
697
698
  }],
698
699
  ..test_result("Some content")
699
700
  };
@@ -374,6 +374,7 @@ mod tests {
374
374
  ],
375
375
  markdown: "| Header1 | Header2 |\n| Cell1 | Cell2 |".to_string(),
376
376
  page_number: 1,
377
+ bounding_box: None,
377
378
  };
378
379
 
379
380
  let image = ExtractedImage {
@@ -388,6 +389,7 @@ mod tests {
388
389
  is_mask: false,
389
390
  description: None,
390
391
  ocr_result: None,
392
+ bounding_box: None,
391
393
  };
392
394
 
393
395
  let result = ExtractionResult {
@@ -91,6 +91,7 @@ impl DocumentExtractor for CsvExtractor {
91
91
  cells: rows.clone(),
92
92
  markdown,
93
93
  page_number: 1,
94
+ bounding_box: None,
94
95
  };
95
96
 
96
97
  let row_count = rows.len();
@@ -56,6 +56,7 @@ pub fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
56
56
  cells,
57
57
  markdown,
58
58
  page_number: idx + 1,
59
+ bounding_box: None,
59
60
  });
60
61
  table_index += 1;
61
62
  }
@@ -242,6 +242,7 @@ fn parse_docbook_single_pass(content: &str) -> Result<DocBookParseResult> {
242
242
  cells: current_table.clone(),
243
243
  markdown,
244
244
  page_number: table_index + 1,
245
+ bounding_box: None,
245
246
  });
246
247
  table_index += 1;
247
248
  current_table.clear();