RubyGems - kreuzberg - Versions diffs - 4.3.4 → 4.3.5 - Mend

kreuzberg 4.3.4 → 4.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +5 -5
data/ext/kreuzberg_rb/native/src/result.rs +40 -0
data/lib/kreuzberg/result.rb +44 -20
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +9 -0
data/sig/kreuzberg.rbs +7 -2
data/vendor/Cargo.toml +3 -3
data/vendor/kreuzberg/Cargo.toml +5 -5
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/router.rs +2 -2
data/vendor/kreuzberg/src/chunking/core.rs +2 -0
data/vendor/kreuzberg/src/chunking/mod.rs +2 -0
data/vendor/kreuzberg/src/core/pipeline/format.rs +17 -6
data/vendor/kreuzberg/src/core/pipeline/tests.rs +30 -3
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -0
data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +1 -0
data/vendor/kreuzberg/src/extraction/transform/mod.rs +2 -0
data/vendor/kreuzberg/src/extractors/csv.rs +1 -0
data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +1 -0
data/vendor/kreuzberg/src/extractors/docx.rs +2 -0
data/vendor/kreuzberg/src/extractors/excel.rs +1 -0
data/vendor/kreuzberg/src/extractors/html.rs +15 -8
data/vendor/kreuzberg/src/extractors/jats/elements.rs +1 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
data/vendor/kreuzberg/src/extractors/latex/environments.rs +1 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +2 -0
data/vendor/kreuzberg/src/extractors/odt.rs +1 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +185 -10
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +84 -12
data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
data/vendor/kreuzberg/src/extractors/rtf/tables.rs +1 -0
data/vendor/kreuzberg/src/mcp/format.rs +2 -1
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
data/vendor/kreuzberg/src/paddle_ocr/backend.rs +2 -1
data/vendor/kreuzberg/src/paddle_ocr/config.rs +26 -6
data/vendor/kreuzberg/src/pdf/fonts.rs +3 -1
data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +78 -9
data/vendor/kreuzberg/src/pdf/hierarchy/extraction.rs +57 -7
data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +1 -1
data/vendor/kreuzberg/src/pdf/markdown.rs +2014 -0
data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
data/vendor/kreuzberg/src/plugins/extractor/registry.rs +2 -0
data/vendor/kreuzberg/src/plugins/extractor/trait.rs +6 -0
data/vendor/kreuzberg/src/plugins/mod.rs +6 -0
data/vendor/kreuzberg/src/plugins/ocr.rs +6 -0
data/vendor/kreuzberg/src/plugins/processor/mod.rs +1 -0
data/vendor/kreuzberg/src/plugins/validator/mod.rs +1 -0
data/vendor/kreuzberg/src/types/extraction.rs +6 -0
data/vendor/kreuzberg/src/types/mod.rs +167 -0
data/vendor/kreuzberg/src/types/tables.rs +121 -0
data/vendor/kreuzberg/tests/config_behavioral.rs +1 -0
data/vendor/kreuzberg/tests/dump_pdf_markdown.rs +83 -0
data/vendor/kreuzberg/tests/helpers/mod.rs +3 -0
data/vendor/kreuzberg/tests/pdf_markdown_all_docs.rs +282 -0
data/vendor/kreuzberg/tests/pdf_markdown_extraction.rs +108 -0
data/vendor/kreuzberg/tests/pdf_table_detection.rs +285 -0
data/vendor/kreuzberg/tests/pdf_text_merging.rs +9 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +4 -0
data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
data/vendor/kreuzberg-ffi/src/helpers.rs +1 -0
data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
data/vendor/kreuzberg-paddle-ocr/src/crnn_net.rs +58 -0
data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text/char.rs +15 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
metadata +7 -4
data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
data/lib/libpdfium.so +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f3c080ecb465d2860ccaef34aff627bf155689e912b0ff48ae72d99a16f045cf
-  data.tar.gz: 8cd3f4aae7a19c229bfb2451811233f59e04c39a9aa9d6d9d69f9e12d747ec04
+  metadata.gz: 3936788c6812a84428d0467330f573c20c9b569c399eab105cc2815d777b2141
+  data.tar.gz: 1b115f87bc4a40960584de9459725d0afabfefe6244ddea434461a2c6f36a647
 SHA512:
-  metadata.gz: c1948824a088e4d6296bd8f161bd59624a252763b47ef21b2b0763d8be0a472de814f95273e03fd82b6c1f8c66aea2fde8fd70c3315ad296be1eaaf09cf3e64c
-  data.tar.gz: 46340a09386141c8ddc19c4c23f9a3b9e4b74cfa8070d3aeec6ae0e339aec114cf3d83a5d08ef1d9734f293186829b075f1cb79f619fd59133148f9571fa9a4a
+  metadata.gz: 73fb7522dcd091b449d5146e65f008bded545a6c49a35d77468bd9bfa61d30a7413dc7364eef6ce79cca4606c5c710ea3f574509743569a7fb4f8a7bc579f402
+  data.tar.gz: e21fb401768da5005a1edb720b0c00c82aaa9a8ef60b6f2bab3587b3c8c94cd8fdcc220daeb8545bae6523bf8be22dbc1dab7c966ae8f30e888342d96c7e5df2

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.3.4)
+    kreuzberg (4.3.5)
       rb_sys (~> 0.9.119)
 GEM
@@ -59,7 +59,7 @@ GEM
       prism (~> 1.5)
     mutex_m (0.3.0)
     parallel (1.27.0)
-    parser (3.3.10.1)
+    parser (3.3.10.2)
       ast (~> 2.4.1)
       racc
     prism (1.9.0)
@@ -210,7 +210,7 @@ CHECKSUMS
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
-  kreuzberg (4.3.4)
+  kreuzberg (4.3.5)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -219,7 +219,7 @@ CHECKSUMS
   minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
   mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
   parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
-  parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
+  parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
   prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
   pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
   pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.4" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.5" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -37,7 +37,7 @@ collapsible_if = "allow"
 [package]
 name = "kreuzberg-rb"
-version = "4.3.4"
+version = "4.3.5"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/ext/kreuzberg_rb/native/src/lib.rs CHANGED Viewed

@@ -100,19 +100,19 @@ pub fn ruby_cache_stats() -> Result<RHash, Error> {
 // Validation wrapper functions
 pub fn validate_binarization_method(method: String) -> Result<i32, Error> {
-    unsafe { Ok(kreuzberg_validate_binarization_method(method.as_ptr() as *const i8)) }
+    unsafe { Ok(kreuzberg_validate_binarization_method(method.as_ptr() as *const std::os::raw::c_char)) }
 }
 pub fn validate_ocr_backend(backend: String) -> Result<i32, Error> {
-    unsafe { Ok(kreuzberg_validate_ocr_backend(backend.as_ptr() as *const i8)) }
+    unsafe { Ok(kreuzberg_validate_ocr_backend(backend.as_ptr() as *const std::os::raw::c_char)) }
 }
 pub fn validate_language_code(code: String) -> Result<i32, Error> {
-    unsafe { Ok(kreuzberg_validate_language_code(code.as_ptr() as *const i8)) }
+    unsafe { Ok(kreuzberg_validate_language_code(code.as_ptr() as *const std::os::raw::c_char)) }
 }
 pub fn validate_token_reduction_level(level: String) -> Result<i32, Error> {
-    unsafe { Ok(kreuzberg_validate_token_reduction_level(level.as_ptr() as *const i8)) }
+    unsafe { Ok(kreuzberg_validate_token_reduction_level(level.as_ptr() as *const std::os::raw::c_char)) }
 }
 pub fn validate_tesseract_psm(psm: i32) -> Result<i32, Error> {
@@ -124,7 +124,7 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<i32, Error> {
 }
 pub fn validate_output_format(format: String) -> Result<i32, Error> {
-    unsafe { Ok(kreuzberg_validate_output_format(format.as_ptr() as *const i8)) }
+    unsafe { Ok(kreuzberg_validate_output_format(format.as_ptr() as *const std::os::raw::c_char)) }
 }
 pub fn validate_confidence(confidence: f64) -> Result<i32, Error> {

data/ext/kreuzberg_rb/native/src/result.rs CHANGED Viewed

@@ -53,6 +53,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
         table_hash.aset("cells", cells_array)?;
         table_hash.aset("markdown", table.markdown)?;
         table_hash.aset("page_number", table.page_number)?;
+        if let Some(bbox) = table.bounding_box {
+            let bbox_hash = ruby.hash_new();
+            bbox_hash.aset("x0", bbox.x0)?;
+            bbox_hash.aset("y0", bbox.y0)?;
+            bbox_hash.aset("x1", bbox.x1)?;
+            bbox_hash.aset("y1", bbox.y1)?;
+            table_hash.aset("bounding_box", bbox_hash)?;
+        } else {
+            table_hash.aset("bounding_box", ruby.qnil().as_value())?;
+        }
         tables_array.push(table_hash)?;
     }
@@ -164,6 +174,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
             } else {
                 image_hash.aset("ocr_result", ruby.qnil().as_value())?;
             }
+            if let Some(bbox) = image.bounding_box {
+                let bbox_hash = ruby.hash_new();
+                bbox_hash.aset("x0", bbox.x0)?;
+                bbox_hash.aset("y0", bbox.y0)?;
+                bbox_hash.aset("x1", bbox.x1)?;
+                bbox_hash.aset("y1", bbox.y1)?;
+                image_hash.aset("bounding_box", bbox_hash)?;
+            } else {
+                image_hash.aset("bounding_box", ruby.qnil().as_value())?;
+            }
             images_array.push(image_hash)?;
         }
         set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
@@ -191,6 +211,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
                 table_hash.aset("cells", cells_array)?;
                 table_hash.aset("markdown", table.markdown.clone())?;
                 table_hash.aset("page_number", table.page_number as i64)?;
+                if let Some(ref bbox) = table.bounding_box {
+                    let bbox_hash = ruby.hash_new();
+                    bbox_hash.aset("x0", bbox.x0)?;
+                    bbox_hash.aset("y0", bbox.y0)?;
+                    bbox_hash.aset("x1", bbox.x1)?;
+                    bbox_hash.aset("y1", bbox.y1)?;
+                    table_hash.aset("bounding_box", bbox_hash)?;
+                } else {
+                    table_hash.aset("bounding_box", ruby.qnil().as_value())?;
+                }
                 tables_array.push(table_hash)?;
             }
@@ -248,6 +278,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
                 } else {
                     image_hash.aset("ocr_result", ruby.qnil().as_value())?;
                 }
+                if let Some(ref bbox) = image.bounding_box {
+                    let bbox_hash = ruby.hash_new();
+                    bbox_hash.aset("x0", bbox.x0)?;
+                    bbox_hash.aset("y0", bbox.y0)?;
+                    bbox_hash.aset("x1", bbox.x1)?;
+                    bbox_hash.aset("y1", bbox.y1)?;
+                    image_hash.aset("bounding_box", bbox_hash)?;
+                } else {
+                    image_hash.aset("bounding_box", ruby.qnil().as_value())?;
+                }
                 images_array.push(image_hash)?;
             }
             page_hash.aset("images", images_array)?;

data/lib/kreuzberg/result.rb CHANGED Viewed

@@ -22,9 +22,11 @@ module Kreuzberg
     #   @return [String] Markdown representation
     # @!attribute [r] page_number
     #   @return [Integer] Page number where table was found
-    Table = Struct.new(:cells, :markdown, :page_number, keyword_init: true) do
+    # @!attribute [r] bounding_box
+    #   @return [BoundingBox, nil] Bounding box of the table on the page
+    Table = Struct.new(:cells, :markdown, :page_number, :bounding_box, keyword_init: true) do
       def to_h
-        { cells: cells, markdown: markdown, page_number: page_number }
+        { cells: cells, markdown: markdown, page_number: page_number, bounding_box: bounding_box&.to_h }
       end
     end
@@ -78,6 +80,7 @@ module Kreuzberg
       :bits_per_component,
       :is_mask,
       :description,
+      :bounding_box,
       :ocr_result,
       keyword_init: true
     ) do
@@ -93,6 +96,7 @@ module Kreuzberg
           bits_per_component: bits_per_component,
           is_mask: is_mask,
           description: description,
+          bounding_box: bounding_box&.to_h,
           ocr_result: ocr_result&.to_h
         }
       end
@@ -486,10 +490,12 @@ module Kreuzberg
       return [] if tables_data.nil? || tables_data.empty?
       tables_data.map do |table_hash|
+        bounding_box = parse_bounding_box(table_hash['bounding_box'])
         Table.new(
           cells: table_hash['cells'] || [],
           markdown: table_hash['markdown'] || '',
-          page_number: table_hash['page_number'] || 0
+          page_number: table_hash['page_number'] || 0,
+          bounding_box: bounding_box
         )
       end
     end
@@ -521,23 +527,26 @@ module Kreuzberg
     def parse_images(images_data)
       return nil if images_data.nil?
-      images_data.map do |image_hash|
-        data = image_hash['data']
-        data = data.dup.force_encoding(Encoding::BINARY) if data.respond_to?(:force_encoding)
-        Image.new(
-          data: data,
-          format: image_hash['format'],
-          image_index: image_hash['image_index'],
-          page_number: image_hash['page_number'],
-          width: image_hash['width'],
-          height: image_hash['height'],
-          colorspace: image_hash['colorspace'],
-          bits_per_component: image_hash['bits_per_component'],
-          is_mask: image_hash['is_mask'],
-          description: image_hash['description'],
-          ocr_result: image_hash['ocr_result'] ? Result.new(image_hash['ocr_result']) : nil
-        )
-      end
+      images_data.map { |image_hash| parse_single_image(image_hash) }
+    end
+    def parse_single_image(image_hash)
+      data = image_hash['data']
+      data = data.dup.force_encoding(Encoding::BINARY) if data.respond_to?(:force_encoding)
+      Image.new(
+        data: data,
+        format: image_hash['format'],
+        image_index: image_hash['image_index'],
+        page_number: image_hash['page_number'],
+        width: image_hash['width'],
+        height: image_hash['height'],
+        colorspace: image_hash['colorspace'],
+        bits_per_component: image_hash['bits_per_component'],
+        is_mask: image_hash['is_mask'],
+        description: image_hash['description'],
+        bounding_box: parse_bounding_box(image_hash['bounding_box']),
+        ocr_result: image_hash['ocr_result'] ? Result.new(image_hash['ocr_result']) : nil
+      )
     end
     def parse_pages(pages_data)
@@ -610,6 +619,21 @@ module Kreuzberg
       )
     end
+    def parse_bounding_box(bounding_box_data)
+      return nil if bounding_box_data.nil?
+      # If it's already a BoundingBox object, return it
+      return bounding_box_data if bounding_box_data.is_a?(BoundingBox)
+      # Otherwise parse from hash
+      BoundingBox.new(
+        x0: bounding_box_data['x0'].to_f,
+        y0: bounding_box_data['y0'].to_f,
+        x1: bounding_box_data['x1'].to_f,
+        y1: bounding_box_data['y1'].to_f
+      )
+    end
     def parse_ocr_elements(ocr_elements_data)
       return nil if ocr_elements_data.nil?

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.3.4'
+  VERSION = '4.3.5'
 end

data/lib/kreuzberg.rb CHANGED Viewed

@@ -21,11 +21,20 @@ module Kreuzberg
   autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
   autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
+  autoload :BoundingBox, 'kreuzberg/types'
+  autoload :ElementMetadata, 'kreuzberg/types'
+  autoload :Element, 'kreuzberg/types'
   autoload :HtmlMetadata, 'kreuzberg/types'
   autoload :HeaderMetadata, 'kreuzberg/types'
   autoload :LinkMetadata, 'kreuzberg/types'
   autoload :ImageMetadata, 'kreuzberg/types'
   autoload :StructuredData, 'kreuzberg/types'
+  autoload :ExtractedKeyword, 'kreuzberg/types'
+  autoload :ProcessingWarning, 'kreuzberg/types'
+  autoload :DocumentBoundingBox, 'kreuzberg/types'
+  autoload :DocumentAnnotation, 'kreuzberg/types'
+  autoload :DocumentNode, 'kreuzberg/types'
+  autoload :DocumentStructure, 'kreuzberg/types'
   ExtractionConfig = Config::Extraction
   PageConfig = Config::PageConfig

data/sig/kreuzberg.rbs CHANGED Viewed

@@ -677,7 +677,8 @@ module Kreuzberg
   type table_hash = {
     cells: Array[Array[String]],
     markdown: String,
-    page_number: Integer
+    page_number: Integer,
+    bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
   }
   type chunk_hash = {
@@ -703,6 +704,7 @@ module Kreuzberg
     bits_per_component: Integer?,
     is_mask: bool,
     description: String?,
+    bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?,
     ocr_result: extraction_result_hash?
   }
@@ -746,8 +748,9 @@ module Kreuzberg
       attr_reader cells: Array[Array[String]]
       attr_reader markdown: String
       attr_reader page_number: Integer
+      attr_reader bounding_box: BoundingBox?
-      def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer) -> void
+      def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer, bounding_box: BoundingBox?) -> void
       def to_h: () -> table_hash
     end
@@ -789,6 +792,7 @@ module Kreuzberg
       attr_reader bits_per_component: Integer?
       attr_reader is_mask: bool
       attr_reader description: String?
+      attr_reader bounding_box: BoundingBox?
       attr_reader ocr_result: Result?
       def initialize: (
@@ -802,6 +806,7 @@ module Kreuzberg
         bits_per_component: Integer?,
         is_mask: bool,
         description: String?,
+        bounding_box: BoundingBox?,
         ocr_result: Result?
       ) -> void
       def to_h: () -> image_hash

data/vendor/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
 [workspace.package]
-version = "4.3.4"
+version = "4.3.5"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -21,7 +21,7 @@ console_error_panic_hook = "0.1"
 criterion = { version = "0.8", features = ["html_reports"] }
 getrandom = { version = "0.4.1", features = ["wasm_js"] }
 hex = "0.4.3"
-html-to-markdown-rs = { version = "2.25.0", default-features = false }
+html-to-markdown-rs = { version = "2.25.1", default-features = false }
 image = { version = "0.25.9", default-features = false }
 js-sys = "0.3"
 libc = "0.2.182"
@@ -37,7 +37,7 @@ serde_json = { version = "1.0.149" }
 tempfile = "3.25.0"
 thiserror = "2.0.18"
 tokio = { version = "1.49.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
-toml = "1.0.1"
+toml = "1.0.2"
 tracing = "0.1"
 wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
 wasm-bindgen-futures = "0.4"

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.3.4"
+version = "4.3.5"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -154,7 +154,7 @@ serde = { version = "1.0.228", features = ["derive"] }
 serde_json = { version = "1.0.149" }
 serde_yaml_ng = "0.10.0"
 jotdown = "0.9"
-toml = "1.0.1"
+toml = "1.0.2"
 mime_guess = "2.0"
 rmp-serde = "1.3"
 thiserror = "2.0.18"
@@ -167,11 +167,11 @@ lopdf = { version = "0.39.0", optional = true }
 calamine = { version = "0.33.0", features = ["dates"], optional = true }
 polars = { version = "0.53.0", default-features = false, features = ["ipc"], optional = true }
 roxmltree = { version = "0.21.1", optional = true }
-zip = { version = "8.0.0", optional = true, default-features = false, features = [
+zip = { version = "8.1.0", optional = true, default-features = false, features = [
     "deflate-flate2",
 ] }
 mail-parser = { version = "0.11.2", optional = true }
-html-to-markdown-rs = { version = "2.25.0", default-features = false , features = [
+html-to-markdown-rs = { version = "2.25.1", default-features = false , features = [
     "inline-images", "metadata", ], optional = true }
 cfb = { version = "0.14.0", optional = true }
 quick-xml = { version = "0.39.1", features = ["serialize"], optional = true }
@@ -236,7 +236,7 @@ sha2 = { version = "0.10", optional = true }
 tempfile = "3.25.0"
 filetime = "0.2"
 tar = "0.4.44"
-zip = { version = "8.0.0", default-features = false, features = ["deflate-flate2"] }
+zip = { version = "8.1.0", default-features = false, features = ["deflate-flate2"] }
 serial_test = "3.3.1"
 anyhow = "1.0"
 tokio-test = "0.4"

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.3.4 Release**
+> **🚀 Version 4.3.5 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/api/router.rs CHANGED Viewed

@@ -99,7 +99,7 @@ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits
 /// # Examples
 ///
 /// ```no_run
-/// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
+/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits_and_server_config, ApiSizeLimits}, core::ServerConfig};
 ///
 /// # #[tokio::main]
 /// # async fn main() -> kreuzberg::Result<()> {
@@ -108,7 +108,7 @@ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits
 /// server_config.cors_origins = vec!["https://example.com".to_string()];
 /// let router = create_router_with_limits_and_server_config(
 ///     extraction_config,
-///     Default::default(),
+///     ApiSizeLimits::default(),
 ///     server_config
 /// );
 /// # Ok(())

data/vendor/kreuzberg/src/chunking/core.rs CHANGED Viewed

@@ -37,6 +37,8 @@ use super::validation::validate_utf8_boundaries;
 ///     overlap: 50,
 ///     trim: true,
 ///     chunker_type: ChunkerType::Text,
+///     embedding: None,
+///     preset: None,
 /// };
 /// let result = chunk_text("Long text...", &config, None)?;
 /// assert!(!result.chunks.is_empty());

data/vendor/kreuzberg/src/chunking/mod.rs CHANGED Viewed

@@ -27,6 +27,8 @@
 //!     overlap: 50,
 //!     trim: true,
 //!     chunker_type: ChunkerType::Text,
+//!     embedding: None,
+//!     preset: None,
 //! };
 //!
 //! let long_text = "This is a very long document...".repeat(100);

data/vendor/kreuzberg/src/core/pipeline/format.rs CHANGED Viewed

@@ -23,10 +23,12 @@ use std::borrow::Cow;
 /// * `result` - The extraction result to modify
 /// * `output_format` - The desired output format
 pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputFormat) {
-    // Check if content was already formatted during extraction
-    let already_formatted = match &*result.mime_type {
-        "text/markdown" if output_format == OutputFormat::Markdown => true,
-        "text/djot" if output_format == OutputFormat::Djot => true,
+    // Check if content was already formatted during extraction.
+    // Since extractors now preserve original MIME types, detect by checking
+    // metadata.output_format which is set by extractors that pre-format.
+    let already_formatted = match result.metadata.output_format.as_deref() {
+        Some("markdown") if output_format == OutputFormat::Markdown => true,
+        Some("djot") if output_format == OutputFormat::Djot => true,
         _ => false,
     };
@@ -195,7 +197,11 @@ mod tests {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: Cow::Borrowed("text/djot"),
+            mime_type: Cow::Borrowed("text/html"),
+            metadata: Metadata {
+                output_format: Some("djot".to_string()),
+                ..Default::default()
+            },
             djot_content: Some(DjotContent {
                 plain_text: "Hello World".to_string(),
                 blocks: vec![FormattedBlock {
@@ -322,6 +328,7 @@ mod tests {
             cells: vec![vec!["A".to_string(), "B".to_string()]],
             markdown: "| A | B |".to_string(),
             page_number: 1,
+            bounding_box: None,
         };
         let mut result = ExtractionResult {
@@ -367,7 +374,11 @@ mod tests {
         let mut result = ExtractionResult {
             content: "test".to_string(),
-            mime_type: Cow::Borrowed("text/djot"),
+            mime_type: Cow::Borrowed("text/html"),
+            metadata: Metadata {
+                output_format: Some("djot".to_string()),
+                ..Default::default()
+            },
             djot_content: Some(djot_content),
             ..Default::default()
         };

data/vendor/kreuzberg/src/core/pipeline/tests.rs CHANGED Viewed

@@ -40,7 +40,13 @@ async fn test_run_pipeline_basic() {
         Cow::Borrowed(VALIDATION_MARKER_KEY),
         serde_json::json!(ORDER_VALIDATION_MARKER),
     );
-    let config = ExtractionConfig::default();
+    let config = ExtractionConfig {
+        postprocessor: Some(crate::core::config::PostProcessorConfig {
+            enabled: false,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
     let processed = run_pipeline(result, &config).await.unwrap();
     assert_eq!(processed.content, "test");
@@ -98,6 +104,10 @@ async fn test_pipeline_without_quality_processing() {
     };
     let config = ExtractionConfig {
         enable_quality_processing: false,
+        postprocessor: Some(crate::core::config::PostProcessorConfig {
+            enabled: false,
+            ..Default::default()
+        }),
         ..Default::default()
     };
@@ -166,6 +176,10 @@ async fn test_pipeline_without_chunking() {
     };
     let config = ExtractionConfig {
         chunking: None,
+        postprocessor: Some(crate::core::config::PostProcessorConfig {
+            enabled: false,
+            ..Default::default()
+        }),
         ..Default::default()
     };
@@ -201,7 +215,13 @@ async fn test_pipeline_preserves_metadata() {
         quality_score: None,
         processing_warnings: Vec::new(),
     };
-    let config = ExtractionConfig::default();
+    let config = ExtractionConfig {
+        postprocessor: Some(crate::core::config::PostProcessorConfig {
+            enabled: false,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
     let processed = run_pipeline(result, &config).await.unwrap();
     assert_eq!(
@@ -222,6 +242,7 @@ async fn test_pipeline_preserves_tables() {
         cells: vec![vec!["A".to_string(), "B".to_string()]],
         markdown: "| A | B |".to_string(),
         page_number: 0,
+        bounding_box: None,
     };
     let result = ExtractionResult {
@@ -242,7 +263,13 @@ async fn test_pipeline_preserves_tables() {
         quality_score: None,
         processing_warnings: Vec::new(),
     };
-    let config = ExtractionConfig::default();
+    let config = ExtractionConfig {
+        postprocessor: Some(crate::core::config::PostProcessorConfig {
+            enabled: false,
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
     let processed = run_pipeline(result, &config).await.unwrap();
     assert_eq!(processed.tables.len(), 1);

data/vendor/kreuzberg/src/extraction/pptx/mod.rs CHANGED Viewed

@@ -157,6 +157,7 @@ fn extract_pptx_from_container<R: std::io::Read + std::io::Seek>(
                     is_mask: false,
                     description: None,
                     ocr_result: None,
+                    bounding_box: None,
                 });
             }
         }

data/vendor/kreuzberg/src/extraction/transform/document_tree.rs CHANGED Viewed

@@ -694,6 +694,7 @@ mod tests {
                 ],
                 markdown: "| Name | Age |\n|---|---|\n| Alice | 30 |".to_string(),
                 page_number: 1,
+                bounding_box: None,
             }],
             ..test_result("Some content")
         };

data/vendor/kreuzberg/src/extraction/transform/mod.rs CHANGED Viewed

@@ -374,6 +374,7 @@ mod tests {
             ],
             markdown: "| Header1 | Header2 |\n| Cell1 | Cell2 |".to_string(),
             page_number: 1,
+            bounding_box: None,
         };
         let image = ExtractedImage {
@@ -388,6 +389,7 @@ mod tests {
             is_mask: false,
             description: None,
             ocr_result: None,
+            bounding_box: None,
         };
         let result = ExtractionResult {

data/vendor/kreuzberg/src/extractors/csv.rs CHANGED Viewed

@@ -91,6 +91,7 @@ impl DocumentExtractor for CsvExtractor {
             cells: rows.clone(),
             markdown,
             page_number: 1,
+            bounding_box: None,
         };
         let row_count = rows.len();

data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs CHANGED Viewed

@@ -56,6 +56,7 @@ pub fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
                         cells,
                         markdown,
                         page_number: idx + 1,
+                        bounding_box: None,
                     });
                     table_index += 1;
                 }

data/vendor/kreuzberg/src/extractors/docbook.rs CHANGED Viewed

@@ -242,6 +242,7 @@ fn parse_docbook_single_pass(content: &str) -> Result<DocBookParseResult> {
                                 cells: current_table.clone(),
                                 markdown,
                                 page_number: table_index + 1,
+                                bounding_box: None,
                             });
                             table_index += 1;
                             current_table.clear();