RubyGems - kreuzberg - Versions diffs - 4.2.9 → 4.2.10 - Mend

kreuzberg 4.2.9 → 4.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

checksums.yaml +4 -4
data/Gemfile.lock +13 -13
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/lib/kreuzberg/config.rb +2 -6
data/lib/kreuzberg/version.rb +1 -1
data/spec/binding/cache_spec.rb +2 -2
data/spec/binding/cli_spec.rb +4 -4
data/spec/binding/images_spec.rb +2 -2
data/spec/binding/metadata_types_spec.rb +1 -1
data/spec/binding/pages_extraction_spec.rb +26 -26
data/spec/binding/tables_spec.rb +1 -1
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +2 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/examples/bench_fixes.rs +4 -7
data/vendor/kreuzberg/examples/test_pdfium_fork.rs +3 -3
data/vendor/kreuzberg/src/core/mime.rs +113 -0
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +3 -4
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +7 -7
data/vendor/kreuzberg/src/mcp/tools/mime.rs +4 -4
data/vendor/kreuzberg/src/pdf/text.rs +1 -1
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +2 -2
data/vendor/kreuzberg/tests/docx_mime_detection_test.rs +97 -0
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +2 -2
data/vendor/kreuzberg/tests/format_integration.rs +2 -2
data/vendor/kreuzberg/tests/image_integration.rs +4 -4
data/vendor/kreuzberg/tests/issue_350_regression_test.rs +42 -0
data/vendor/kreuzberg/tests/ocr_configuration.rs +8 -8
data/vendor/kreuzberg/tests/ocr_errors.rs +2 -2
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +4 -4
data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +1 -1
data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
data/vendor/kreuzberg/tests/pdfium_linking.rs +24 -27
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +3 -3
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +3 -3
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +3 -3
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/tests/integration_test.rs +1 -1
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1bdd32141526f545868c567acbc8e3a7caf94b4ff7e42bebf859fe33416669e4
-  data.tar.gz: 10da5a6da3a781b9676ba1213a535a69edde90b89ccad45489fab9fb593f5f73
+  metadata.gz: abf625c4f7eedb0ba24619d640ac572192a112bf29876c25c662c8faf8a7219c
+  data.tar.gz: 460cdf492f802db89332e989340070448c5b60bb44ce0860a1104889814bb9ac
 SHA512:
-  metadata.gz: e45428f1c646ed0683f51fa932c2432b0563d3258912fbe7b49f75acf0cdbc43c844c92b17cf7d4a5ddccb0b010d23cce4b20de950877fbe64ecafb858312bc5
-  data.tar.gz: f0abcd49fe46a4f0e3e2bf80e217ff36970b4a6037ecec6ea889230605a83178d76bff31d0960d50fb2ad4e1ea6f703c595bd43c244ff0e082ab365eb86bf02a
+  metadata.gz: 6e9b8b00347a73747e7ab8aad698f2d7a5798609dd1b086fe6df3a723c49bd05c5dff3c8ad0e7c83720cc3944b1a9d66fdec710405c9f1e22e43fe55387cdc92
+  data.tar.gz: dab907905f37a8fbc13d4c3e7e893cf6162fe57c6d735bfe08dfec32ea721706ab683d1baa4cfa6b0db4db8c393e7909ee3cf98195881ea31c8dc5ce0cda0b6a

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.2.9)
+    kreuzberg (4.2.10)
       rb_sys (~> 0.9.119)
 GEM
@@ -46,7 +46,7 @@ GEM
     i18n (1.14.8)
       concurrent-ruby (~> 1.0)
     io-console (0.8.2)
-    json (2.18.0)
+    json (2.18.1)
     language_server-protocol (3.17.0.5)
     lint_roller (1.1.0)
     listen (3.10.0)
@@ -75,12 +75,12 @@ GEM
     rake (13.3.1)
     rake-compiler (1.3.1)
       rake
-    rake-compiler-dock (1.10.0)
+    rake-compiler-dock (1.11.0)
     rb-fsevent (0.11.2)
     rb-inotify (0.11.1)
       ffi (~> 1.0)
-    rb_sys (0.9.119)
-      rake-compiler-dock (= 1.10.0)
+    rb_sys (0.9.124)
+      rake-compiler-dock (= 1.11.0)
     rbs (3.10.3)
       logger
       tsort
@@ -100,7 +100,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
     rspec-support (3.13.7)
-    rubocop (1.84.0)
+    rubocop (1.84.1)
       json (~> 2.3)
       language_server-protocol (~> 3.17.0.2)
       lint_roller (~> 1.1.0)
@@ -123,7 +123,7 @@ GEM
       rubocop (~> 1.81)
     ruby-progressbar (1.13.0)
     securerandom (0.4.1)
-    sorbet-runtime (0.6.12908)
+    sorbet-runtime (0.6.12914)
     steep (1.10.0)
       activesupport (>= 5.1)
       concurrent-ruby (>= 1.1.10)
@@ -208,8 +208,8 @@ CHECKSUMS
   fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
-  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
-  kreuzberg (4.2.9)
+  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
+  kreuzberg (4.2.10)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -226,10 +226,10 @@ CHECKSUMS
   rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
   rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
   rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
-  rake-compiler-dock (1.10.0) sha256=dd62ee19df2a185a3315697e560cfa8cc9129901332152851e023fab0e94bf11
+  rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
   rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
   rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
-  rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
+  rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
   rbs (3.10.3) sha256=70627f3919016134d554e6c99195552ae3ef6020fe034c8e983facc9c192daa6
   regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
   reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
@@ -238,13 +238,13 @@ CHECKSUMS
   rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
   rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
   rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
-  rubocop (1.84.0) sha256=88dec310153bb685a879f5a7cdb601f6287b8f0ee675d9dc63a17c7204c4190a
+  rubocop (1.84.1) sha256=14cc626f355141f5a2ef53c10a68d66b13bb30639b26370a76559096cc6bcc1a
   rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
   rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
   rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
   ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
   securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
-  sorbet-runtime (0.6.12908) sha256=229f43e76527b01c5291c00c43cc29ccebe437a87b34925c3ee250ebf23d328e
+  sorbet-runtime (0.6.12914) sha256=6d3c985d671dab9ab8ea244b51888b6e8e8e65e881e5bf816d1ac0950479dce6
   steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
   strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
   terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.9" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.10" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -37,7 +37,7 @@ collapsible_if = "allow"
 [package]
 name = "kreuzberg-rb"
-version = "4.0.0"
+version = "4.2.10"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/lib/kreuzberg/config.rb CHANGED Viewed

@@ -71,7 +71,6 @@ module Kreuzberg
     class Chunking
       attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
-      # rubocop:disable Metrics/CyclomaticComplexity
       def initialize(
         max_chars: nil,
         max_overlap: nil,
@@ -81,7 +80,6 @@ module Kreuzberg
         chunk_overlap: nil,
         enabled: true
       )
-        # rubocop:enable Metrics/CyclomaticComplexity
         resolved_size = chunk_size || max_chars || 1000
         resolved_overlap = chunk_overlap || max_overlap || 200
@@ -867,7 +865,6 @@ module Kreuzberg
               "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
       end
-      # rubocop:disable Metrics/CyclomaticComplexity
       def to_h
         {
           use_cache: @use_cache,
@@ -888,7 +885,6 @@ module Kreuzberg
           result_format: @result_format
         }.compact
       end
-      # rubocop:enable Metrics/CyclomaticComplexity
       # Serialize configuration to JSON string
       #
@@ -992,7 +988,7 @@ module Kreuzberg
       #   config[:use_cache] = false
       #   config[:force_ocr] = true
       #
-      # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
+      # rubocop:disable Metrics/MethodLength
       def []=(key, value)
         key_sym = key.to_sym
         case key_sym
@@ -1032,7 +1028,7 @@ module Kreuzberg
           raise ArgumentError, "Unknown configuration key: #{key}"
         end
       end
-      # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
+      # rubocop:enable Metrics/MethodLength
       # Get a configuration field using hash-like syntax
       #

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.2.9'
+  VERSION = '4.2.10'
 end

data/spec/binding/cache_spec.rb CHANGED Viewed

@@ -4,10 +4,10 @@ require 'spec_helper'
 RSpec.describe 'Cache Management' do
   let(:test_pdf) do
-    test_document_path('pdfs/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
+    test_document_path('pdf/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
   end
   let(:test_text) { test_document_path('text/contract_test.txt') }
-  let(:test_docx) { test_document_path('documents/contract.docx') }
+  let(:test_docx) { test_document_path('docx/extraction_test.docx') }
   before do
     Kreuzberg.clear_cache

data/spec/binding/cli_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 RSpec.describe Kreuzberg::CLI do
   describe '.extract' do
     it 'extracts content from a file' do
-      path = test_document_path('documents/simple.odt')
+      path = test_document_path('odt/simple.odt')
       output = described_class.extract(path)
       expect(output).to be_a(String)
@@ -11,7 +11,7 @@ RSpec.describe Kreuzberg::CLI do
     end
     it 'accepts output format option' do
-      path = test_document_path('documents/simple.odt')
+      path = test_document_path('odt/simple.odt')
       output = described_class.extract(path, output: 'json')
       expect(output).to be_a(String)
@@ -19,7 +19,7 @@ RSpec.describe Kreuzberg::CLI do
     end
     it 'accepts OCR option' do
-      path = test_document_path('pdfs/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
+      path = test_document_path('pdf/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
       output = described_class.extract(path, ocr: false)
       expect(output).to be_a(String)
@@ -29,7 +29,7 @@ RSpec.describe Kreuzberg::CLI do
   describe '.detect' do
     it 'detects MIME type' do
-      path = test_document_path('documents/simple.odt')
+      path = test_document_path('odt/simple.odt')
       mime_type = described_class.detect(path)
       expect(mime_type).to be_a(String)

data/spec/binding/images_spec.rb CHANGED Viewed

@@ -108,7 +108,7 @@ RSpec.describe 'Image Extraction' do
       )
       begin
-        docx_path = test_document_path('office/document.docx')
+        docx_path = test_document_path('docx/extraction_test.docx')
         result = Kreuzberg.extract_file_sync(path: docx_path, config: config)
         expect(result).not_to be_nil
@@ -126,7 +126,7 @@ RSpec.describe 'Image Extraction' do
       )
       begin
-        pptx_path = test_document_path('presentations/simple.pptx')
+        pptx_path = test_document_path('pptx/simple.pptx')
         result = Kreuzberg.extract_file_sync(path: pptx_path, config: config)
         expect(result).not_to be_nil

data/spec/binding/metadata_types_spec.rb CHANGED Viewed

@@ -959,7 +959,7 @@ RSpec.describe 'Kreuzberg Metadata Types' do
   describe 'Integration Test: Extract actual HTML file' do
     it 'extracts metadata from actual HTML file' do
-      html_file = test_document_path('web/html.html')
+      html_file = test_document_path('html/html.html')
       expect(File.exist?(html_file)).to be(true)

data/spec/binding/pages_extraction_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 RSpec.describe 'Pages Extraction' do
   describe 'Extract Pages' do
     it 'returns pages array when extractPages is true' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -18,7 +18,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'returns page numbers for each page' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -34,7 +34,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'returns page content for each page' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -50,7 +50,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'returns nil for pages when extractPages is false' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -64,7 +64,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'preserves page order' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -83,7 +83,7 @@ RSpec.describe 'Pages Extraction' do
   describe 'Insert Page Markers' do
     it 'inserts page markers when insertPageMarkers is true' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -98,7 +98,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'does not insert markers when insertPageMarkers is false' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -113,7 +113,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'contains page numbers in markers' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -128,7 +128,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'inserts multiple markers for multi-page documents' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -145,7 +145,7 @@ RSpec.describe 'Pages Extraction' do
   describe 'Custom Marker Format' do
     it 'uses custom marker format when specified' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       custom_format = '=== PAGE {page_num} ==='
@@ -164,7 +164,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'replaces page_num placeholder in custom format' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       custom_format = '[Page Number: {page_num}]'
@@ -183,7 +183,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'handles simple custom format' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       custom_format = 'PAGE_{page_num}'
@@ -201,7 +201,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'handles custom format with line separators' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       custom_format = "\n---PAGE {page_num}---\n"
@@ -219,7 +219,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'overrides default marker format' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       custom_format = 'CUSTOM_PAGE_{page_num}'
@@ -239,7 +239,7 @@ RSpec.describe 'Pages Extraction' do
   describe 'Multi-Page PDF' do
     it 'produces multiple pages from multi-page PDF' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -253,7 +253,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'page numbers are sequential' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -269,7 +269,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'each page has content' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -286,7 +286,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'with markers contains all pages' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -303,7 +303,7 @@ RSpec.describe 'Pages Extraction' do
   describe 'Page Content Structure Validation' do
     it 'validates page structure' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -320,7 +320,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'page content has required fields' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -337,7 +337,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'page content with tables preserves table data' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -354,7 +354,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'page content with images preserves image data' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -371,7 +371,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'page content is not empty' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -388,7 +388,7 @@ RSpec.describe 'Pages Extraction' do
   describe 'Combined Features' do
     it 'extract pages and insert markers together' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -407,7 +407,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'extract pages with custom marker format' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(
@@ -426,7 +426,7 @@ RSpec.describe 'Pages Extraction' do
     end
     it 'page extraction consistency between array and markers' do
-      pdf_file = test_document_path('pdf/sample.pdf')
+      pdf_file = test_document_path('pdf/sample_contract.pdf')
       skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
       config = Kreuzberg::Config::Extraction.new(

data/spec/binding/tables_spec.rb CHANGED Viewed

@@ -246,7 +246,7 @@ RSpec.describe 'Table Extraction Quality' do
       config = Kreuzberg::Config::Extraction.new
       begin
-        result = Kreuzberg.extract_file(path: test_document_path('office/document.docx'), config: config)
+        result = Kreuzberg.extract_file(path: test_document_path('docx/extraction_test.docx'), config: config)
         expect(result).not_to be_nil
       rescue Kreuzberg::Errors::ValidationError
         skip 'DOCX test file not available'

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.2.9"
+version = "4.2.10"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.2.9"
+version = "4.2.10"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -133,7 +133,7 @@ once_cell = { workspace = true }
 parking_lot = { workspace = true }
 pastey = "0.2"
 rayon = "1.11.0"
-regex = "1.12.2"
+regex = "1.12.3"
 serde = { workspace = true }
 serde_json = { workspace = true }
 serde_yaml_ng = "0.10.0"

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.2.9 Release**
+> **🚀 Version 4.2.10 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/examples/bench_fixes.rs CHANGED Viewed

@@ -5,14 +5,11 @@ use std::time::Instant;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     let test_pdfs = [
         (
-            "a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
+            "a_comprehensive_stud_large_acomprehensives.pdf",
             "Academic Paper (18 fonts)",
         ),
-        (
-            "5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf",
-            "Intel PDF (5 fonts)",
-        ),
-        ("fake_memo.pdf", "Tiny Memo (3-5 fonts)"),
+        ("5_level_paging_and_5_medium_5levelpagingand.pdf", "Intel PDF (5 fonts)"),
+        ("simple_small_fakememo.pdf", "Tiny Memo (3-5 fonts)"),
     ];
     let config = ExtractionConfig {
@@ -24,7 +21,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("Testing warm execution fix and font overhead fix\n");
     for (file, description) in &test_pdfs {
-        let path = PathBuf::from(format!("test_documents/pdfs/{}", file));
+        let path = PathBuf::from(format!("test_documents/pdf/{}", file));
         println!("=== {} ===", description);
         println!("File: {}\n", file);

data/vendor/kreuzberg/examples/test_pdfium_fork.rs CHANGED Viewed

@@ -12,7 +12,7 @@ async fn main() {
     println!("Test 1: fake_memo.pdf");
     let start = Instant::now();
-    match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
+    match extract_file("test_documents/pdf/fake_memo.pdf", None, &config).await {
         Ok(result) => {
             let duration = start.elapsed();
             println!("  ✓ Success! Duration: {:?}", duration);
@@ -26,7 +26,7 @@ async fn main() {
     println!("\nTest 2: Warm iteration");
     let start = Instant::now();
-    match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
+    match extract_file("test_documents/pdf/fake_memo.pdf", None, &config).await {
         Ok(result) => {
             let duration = start.elapsed();
             println!("  ✓ Success! Duration: {:?}", duration);
@@ -41,7 +41,7 @@ async fn main() {
     println!("\nTest 3: Academic Paper (18 fonts)");
     let start = Instant::now();
     match extract_file(
-        "test_documents/pdfs/a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
+        "test_documents/pdf/a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
         None,
         &config,
     )