kreuzberg 4.2.9 → 4.2.10

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +13 -13
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/config.rb +2 -6
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/spec/binding/cache_spec.rb +2 -2
  8. data/spec/binding/cli_spec.rb +4 -4
  9. data/spec/binding/images_spec.rb +2 -2
  10. data/spec/binding/metadata_types_spec.rb +1 -1
  11. data/spec/binding/pages_extraction_spec.rb +26 -26
  12. data/spec/binding/tables_spec.rb +1 -1
  13. data/vendor/Cargo.toml +1 -1
  14. data/vendor/kreuzberg/Cargo.toml +2 -2
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/examples/bench_fixes.rs +4 -7
  17. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +3 -3
  18. data/vendor/kreuzberg/src/core/mime.rs +113 -0
  19. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  20. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +3 -4
  21. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +7 -7
  22. data/vendor/kreuzberg/src/mcp/tools/mime.rs +4 -4
  23. data/vendor/kreuzberg/src/pdf/text.rs +1 -1
  24. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +2 -2
  25. data/vendor/kreuzberg/tests/docx_mime_detection_test.rs +97 -0
  26. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +2 -2
  27. data/vendor/kreuzberg/tests/format_integration.rs +2 -2
  28. data/vendor/kreuzberg/tests/image_integration.rs +4 -4
  29. data/vendor/kreuzberg/tests/issue_350_regression_test.rs +42 -0
  30. data/vendor/kreuzberg/tests/ocr_configuration.rs +8 -8
  31. data/vendor/kreuzberg/tests/ocr_errors.rs +2 -2
  32. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
  33. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  34. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +4 -4
  35. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +1 -1
  36. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  37. data/vendor/kreuzberg/tests/pdfium_linking.rs +24 -27
  38. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +3 -3
  39. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +3 -3
  40. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +3 -3
  41. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  42. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +1 -1
  43. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1bdd32141526f545868c567acbc8e3a7caf94b4ff7e42bebf859fe33416669e4
4
- data.tar.gz: 10da5a6da3a781b9676ba1213a535a69edde90b89ccad45489fab9fb593f5f73
3
+ metadata.gz: abf625c4f7eedb0ba24619d640ac572192a112bf29876c25c662c8faf8a7219c
4
+ data.tar.gz: 460cdf492f802db89332e989340070448c5b60bb44ce0860a1104889814bb9ac
5
5
  SHA512:
6
- metadata.gz: e45428f1c646ed0683f51fa932c2432b0563d3258912fbe7b49f75acf0cdbc43c844c92b17cf7d4a5ddccb0b010d23cce4b20de950877fbe64ecafb858312bc5
7
- data.tar.gz: f0abcd49fe46a4f0e3e2bf80e217ff36970b4a6037ecec6ea889230605a83178d76bff31d0960d50fb2ad4e1ea6f703c595bd43c244ff0e082ab365eb86bf02a
6
+ metadata.gz: 6e9b8b00347a73747e7ab8aad698f2d7a5798609dd1b086fe6df3a723c49bd05c5dff3c8ad0e7c83720cc3944b1a9d66fdec710405c9f1e22e43fe55387cdc92
7
+ data.tar.gz: dab907905f37a8fbc13d4c3e7e893cf6162fe57c6d735bfe08dfec32ea721706ab683d1baa4cfa6b0db4db8c393e7909ee3cf98195881ea31c8dc5ce0cda0b6a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.9)
4
+ kreuzberg (4.2.10)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -46,7 +46,7 @@ GEM
46
46
  i18n (1.14.8)
47
47
  concurrent-ruby (~> 1.0)
48
48
  io-console (0.8.2)
49
- json (2.18.0)
49
+ json (2.18.1)
50
50
  language_server-protocol (3.17.0.5)
51
51
  lint_roller (1.1.0)
52
52
  listen (3.10.0)
@@ -75,12 +75,12 @@ GEM
75
75
  rake (13.3.1)
76
76
  rake-compiler (1.3.1)
77
77
  rake
78
- rake-compiler-dock (1.10.0)
78
+ rake-compiler-dock (1.11.0)
79
79
  rb-fsevent (0.11.2)
80
80
  rb-inotify (0.11.1)
81
81
  ffi (~> 1.0)
82
- rb_sys (0.9.119)
83
- rake-compiler-dock (= 1.10.0)
82
+ rb_sys (0.9.124)
83
+ rake-compiler-dock (= 1.11.0)
84
84
  rbs (3.10.3)
85
85
  logger
86
86
  tsort
@@ -100,7 +100,7 @@ GEM
100
100
  diff-lcs (>= 1.2.0, < 2.0)
101
101
  rspec-support (~> 3.13.0)
102
102
  rspec-support (3.13.7)
103
- rubocop (1.84.0)
103
+ rubocop (1.84.1)
104
104
  json (~> 2.3)
105
105
  language_server-protocol (~> 3.17.0.2)
106
106
  lint_roller (~> 1.1.0)
@@ -123,7 +123,7 @@ GEM
123
123
  rubocop (~> 1.81)
124
124
  ruby-progressbar (1.13.0)
125
125
  securerandom (0.4.1)
126
- sorbet-runtime (0.6.12908)
126
+ sorbet-runtime (0.6.12914)
127
127
  steep (1.10.0)
128
128
  activesupport (>= 5.1)
129
129
  concurrent-ruby (>= 1.1.10)
@@ -208,8 +208,8 @@ CHECKSUMS
208
208
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
- json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
212
- kreuzberg (4.2.9)
211
+ json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
212
+ kreuzberg (4.2.10)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -226,10 +226,10 @@ CHECKSUMS
226
226
  rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
227
227
  rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
228
228
  rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
229
- rake-compiler-dock (1.10.0) sha256=dd62ee19df2a185a3315697e560cfa8cc9129901332152851e023fab0e94bf11
229
+ rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
230
230
  rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
231
231
  rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
232
- rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
232
+ rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
233
233
  rbs (3.10.3) sha256=70627f3919016134d554e6c99195552ae3ef6020fe034c8e983facc9c192daa6
234
234
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
235
235
  reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
@@ -238,13 +238,13 @@ CHECKSUMS
238
238
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
239
239
  rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
240
240
  rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
241
- rubocop (1.84.0) sha256=88dec310153bb685a879f5a7cdb601f6287b8f0ee675d9dc63a17c7204c4190a
241
+ rubocop (1.84.1) sha256=14cc626f355141f5a2ef53c10a68d66b13bb30639b26370a76559096cc6bcc1a
242
242
  rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
243
243
  rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
244
244
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
245
245
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
246
246
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
247
- sorbet-runtime (0.6.12908) sha256=229f43e76527b01c5291c00c43cc29ccebe437a87b34925c3ee250ebf23d328e
247
+ sorbet-runtime (0.6.12914) sha256=6d3c985d671dab9ab8ea244b51888b6e8e8e65e881e5bf816d1ac0950479dce6
248
248
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
249
249
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
250
250
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.9" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.10" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.0.0"
40
+ version = "4.2.10"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -71,7 +71,6 @@ module Kreuzberg
71
71
  class Chunking
72
72
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
73
73
 
74
- # rubocop:disable Metrics/CyclomaticComplexity
75
74
  def initialize(
76
75
  max_chars: nil,
77
76
  max_overlap: nil,
@@ -81,7 +80,6 @@ module Kreuzberg
81
80
  chunk_overlap: nil,
82
81
  enabled: true
83
82
  )
84
- # rubocop:enable Metrics/CyclomaticComplexity
85
83
  resolved_size = chunk_size || max_chars || 1000
86
84
  resolved_overlap = chunk_overlap || max_overlap || 200
87
85
 
@@ -867,7 +865,6 @@ module Kreuzberg
867
865
  "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
868
866
  end
869
867
 
870
- # rubocop:disable Metrics/CyclomaticComplexity
871
868
  def to_h
872
869
  {
873
870
  use_cache: @use_cache,
@@ -888,7 +885,6 @@ module Kreuzberg
888
885
  result_format: @result_format
889
886
  }.compact
890
887
  end
891
- # rubocop:enable Metrics/CyclomaticComplexity
892
888
 
893
889
  # Serialize configuration to JSON string
894
890
  #
@@ -992,7 +988,7 @@ module Kreuzberg
992
988
  # config[:use_cache] = false
993
989
  # config[:force_ocr] = true
994
990
  #
995
- # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
991
+ # rubocop:disable Metrics/MethodLength
996
992
  def []=(key, value)
997
993
  key_sym = key.to_sym
998
994
  case key_sym
@@ -1032,7 +1028,7 @@ module Kreuzberg
1032
1028
  raise ArgumentError, "Unknown configuration key: #{key}"
1033
1029
  end
1034
1030
  end
1035
- # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength
1031
+ # rubocop:enable Metrics/MethodLength
1036
1032
 
1037
1033
  # Get a configuration field using hash-like syntax
1038
1034
  #
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.9'
4
+ VERSION = '4.2.10'
5
5
  end
@@ -4,10 +4,10 @@ require 'spec_helper'
4
4
 
5
5
  RSpec.describe 'Cache Management' do
6
6
  let(:test_pdf) do
7
- test_document_path('pdfs/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
7
+ test_document_path('pdf/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
8
8
  end
9
9
  let(:test_text) { test_document_path('text/contract_test.txt') }
10
- let(:test_docx) { test_document_path('documents/contract.docx') }
10
+ let(:test_docx) { test_document_path('docx/extraction_test.docx') }
11
11
 
12
12
  before do
13
13
  Kreuzberg.clear_cache
@@ -3,7 +3,7 @@
3
3
  RSpec.describe Kreuzberg::CLI do
4
4
  describe '.extract' do
5
5
  it 'extracts content from a file' do
6
- path = test_document_path('documents/simple.odt')
6
+ path = test_document_path('odt/simple.odt')
7
7
  output = described_class.extract(path)
8
8
 
9
9
  expect(output).to be_a(String)
@@ -11,7 +11,7 @@ RSpec.describe Kreuzberg::CLI do
11
11
  end
12
12
 
13
13
  it 'accepts output format option' do
14
- path = test_document_path('documents/simple.odt')
14
+ path = test_document_path('odt/simple.odt')
15
15
  output = described_class.extract(path, output: 'json')
16
16
 
17
17
  expect(output).to be_a(String)
@@ -19,7 +19,7 @@ RSpec.describe Kreuzberg::CLI do
19
19
  end
20
20
 
21
21
  it 'accepts OCR option' do
22
- path = test_document_path('pdfs/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
22
+ path = test_document_path('pdf/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
23
23
  output = described_class.extract(path, ocr: false)
24
24
 
25
25
  expect(output).to be_a(String)
@@ -29,7 +29,7 @@ RSpec.describe Kreuzberg::CLI do
29
29
 
30
30
  describe '.detect' do
31
31
  it 'detects MIME type' do
32
- path = test_document_path('documents/simple.odt')
32
+ path = test_document_path('odt/simple.odt')
33
33
  mime_type = described_class.detect(path)
34
34
 
35
35
  expect(mime_type).to be_a(String)
@@ -108,7 +108,7 @@ RSpec.describe 'Image Extraction' do
108
108
  )
109
109
 
110
110
  begin
111
- docx_path = test_document_path('office/document.docx')
111
+ docx_path = test_document_path('docx/extraction_test.docx')
112
112
  result = Kreuzberg.extract_file_sync(path: docx_path, config: config)
113
113
 
114
114
  expect(result).not_to be_nil
@@ -126,7 +126,7 @@ RSpec.describe 'Image Extraction' do
126
126
  )
127
127
 
128
128
  begin
129
- pptx_path = test_document_path('presentations/simple.pptx')
129
+ pptx_path = test_document_path('pptx/simple.pptx')
130
130
  result = Kreuzberg.extract_file_sync(path: pptx_path, config: config)
131
131
 
132
132
  expect(result).not_to be_nil
@@ -959,7 +959,7 @@ RSpec.describe 'Kreuzberg Metadata Types' do
959
959
 
960
960
  describe 'Integration Test: Extract actual HTML file' do
961
961
  it 'extracts metadata from actual HTML file' do
962
- html_file = test_document_path('web/html.html')
962
+ html_file = test_document_path('html/html.html')
963
963
 
964
964
  expect(File.exist?(html_file)).to be(true)
965
965
 
@@ -3,7 +3,7 @@
3
3
  RSpec.describe 'Pages Extraction' do
4
4
  describe 'Extract Pages' do
5
5
  it 'returns pages array when extractPages is true' do
6
- pdf_file = test_document_path('pdf/sample.pdf')
6
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
7
7
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
8
8
 
9
9
  config = Kreuzberg::Config::Extraction.new(
@@ -18,7 +18,7 @@ RSpec.describe 'Pages Extraction' do
18
18
  end
19
19
 
20
20
  it 'returns page numbers for each page' do
21
- pdf_file = test_document_path('pdf/sample.pdf')
21
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
22
22
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
23
23
 
24
24
  config = Kreuzberg::Config::Extraction.new(
@@ -34,7 +34,7 @@ RSpec.describe 'Pages Extraction' do
34
34
  end
35
35
 
36
36
  it 'returns page content for each page' do
37
- pdf_file = test_document_path('pdf/sample.pdf')
37
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
38
38
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
39
39
 
40
40
  config = Kreuzberg::Config::Extraction.new(
@@ -50,7 +50,7 @@ RSpec.describe 'Pages Extraction' do
50
50
  end
51
51
 
52
52
  it 'returns nil for pages when extractPages is false' do
53
- pdf_file = test_document_path('pdf/sample.pdf')
53
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
54
54
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
55
55
 
56
56
  config = Kreuzberg::Config::Extraction.new(
@@ -64,7 +64,7 @@ RSpec.describe 'Pages Extraction' do
64
64
  end
65
65
 
66
66
  it 'preserves page order' do
67
- pdf_file = test_document_path('pdf/sample.pdf')
67
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
68
68
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
69
69
 
70
70
  config = Kreuzberg::Config::Extraction.new(
@@ -83,7 +83,7 @@ RSpec.describe 'Pages Extraction' do
83
83
 
84
84
  describe 'Insert Page Markers' do
85
85
  it 'inserts page markers when insertPageMarkers is true' do
86
- pdf_file = test_document_path('pdf/sample.pdf')
86
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
87
87
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
88
88
 
89
89
  config = Kreuzberg::Config::Extraction.new(
@@ -98,7 +98,7 @@ RSpec.describe 'Pages Extraction' do
98
98
  end
99
99
 
100
100
  it 'does not insert markers when insertPageMarkers is false' do
101
- pdf_file = test_document_path('pdf/sample.pdf')
101
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
102
102
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
103
103
 
104
104
  config = Kreuzberg::Config::Extraction.new(
@@ -113,7 +113,7 @@ RSpec.describe 'Pages Extraction' do
113
113
  end
114
114
 
115
115
  it 'contains page numbers in markers' do
116
- pdf_file = test_document_path('pdf/sample.pdf')
116
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
117
117
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
118
118
 
119
119
  config = Kreuzberg::Config::Extraction.new(
@@ -128,7 +128,7 @@ RSpec.describe 'Pages Extraction' do
128
128
  end
129
129
 
130
130
  it 'inserts multiple markers for multi-page documents' do
131
- pdf_file = test_document_path('pdf/sample.pdf')
131
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
132
132
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
133
133
 
134
134
  config = Kreuzberg::Config::Extraction.new(
@@ -145,7 +145,7 @@ RSpec.describe 'Pages Extraction' do
145
145
 
146
146
  describe 'Custom Marker Format' do
147
147
  it 'uses custom marker format when specified' do
148
- pdf_file = test_document_path('pdf/sample.pdf')
148
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
149
149
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
150
150
 
151
151
  custom_format = '=== PAGE {page_num} ==='
@@ -164,7 +164,7 @@ RSpec.describe 'Pages Extraction' do
164
164
  end
165
165
 
166
166
  it 'replaces page_num placeholder in custom format' do
167
- pdf_file = test_document_path('pdf/sample.pdf')
167
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
168
168
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
169
169
 
170
170
  custom_format = '[Page Number: {page_num}]'
@@ -183,7 +183,7 @@ RSpec.describe 'Pages Extraction' do
183
183
  end
184
184
 
185
185
  it 'handles simple custom format' do
186
- pdf_file = test_document_path('pdf/sample.pdf')
186
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
187
187
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
188
188
 
189
189
  custom_format = 'PAGE_{page_num}'
@@ -201,7 +201,7 @@ RSpec.describe 'Pages Extraction' do
201
201
  end
202
202
 
203
203
  it 'handles custom format with line separators' do
204
- pdf_file = test_document_path('pdf/sample.pdf')
204
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
205
205
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
206
206
 
207
207
  custom_format = "\n---PAGE {page_num}---\n"
@@ -219,7 +219,7 @@ RSpec.describe 'Pages Extraction' do
219
219
  end
220
220
 
221
221
  it 'overrides default marker format' do
222
- pdf_file = test_document_path('pdf/sample.pdf')
222
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
223
223
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
224
224
 
225
225
  custom_format = 'CUSTOM_PAGE_{page_num}'
@@ -239,7 +239,7 @@ RSpec.describe 'Pages Extraction' do
239
239
 
240
240
  describe 'Multi-Page PDF' do
241
241
  it 'produces multiple pages from multi-page PDF' do
242
- pdf_file = test_document_path('pdf/sample.pdf')
242
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
243
243
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
244
244
 
245
245
  config = Kreuzberg::Config::Extraction.new(
@@ -253,7 +253,7 @@ RSpec.describe 'Pages Extraction' do
253
253
  end
254
254
 
255
255
  it 'page numbers are sequential' do
256
- pdf_file = test_document_path('pdf/sample.pdf')
256
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
257
257
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
258
258
 
259
259
  config = Kreuzberg::Config::Extraction.new(
@@ -269,7 +269,7 @@ RSpec.describe 'Pages Extraction' do
269
269
  end
270
270
 
271
271
  it 'each page has content' do
272
- pdf_file = test_document_path('pdf/sample.pdf')
272
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
273
273
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
274
274
 
275
275
  config = Kreuzberg::Config::Extraction.new(
@@ -286,7 +286,7 @@ RSpec.describe 'Pages Extraction' do
286
286
  end
287
287
 
288
288
  it 'with markers contains all pages' do
289
- pdf_file = test_document_path('pdf/sample.pdf')
289
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
290
290
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
291
291
 
292
292
  config = Kreuzberg::Config::Extraction.new(
@@ -303,7 +303,7 @@ RSpec.describe 'Pages Extraction' do
303
303
 
304
304
  describe 'Page Content Structure Validation' do
305
305
  it 'validates page structure' do
306
- pdf_file = test_document_path('pdf/sample.pdf')
306
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
307
307
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
308
308
 
309
309
  config = Kreuzberg::Config::Extraction.new(
@@ -320,7 +320,7 @@ RSpec.describe 'Pages Extraction' do
320
320
  end
321
321
 
322
322
  it 'page content has required fields' do
323
- pdf_file = test_document_path('pdf/sample.pdf')
323
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
324
324
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
325
325
 
326
326
  config = Kreuzberg::Config::Extraction.new(
@@ -337,7 +337,7 @@ RSpec.describe 'Pages Extraction' do
337
337
  end
338
338
 
339
339
  it 'page content with tables preserves table data' do
340
- pdf_file = test_document_path('pdf/sample.pdf')
340
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
341
341
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
342
342
 
343
343
  config = Kreuzberg::Config::Extraction.new(
@@ -354,7 +354,7 @@ RSpec.describe 'Pages Extraction' do
354
354
  end
355
355
 
356
356
  it 'page content with images preserves image data' do
357
- pdf_file = test_document_path('pdf/sample.pdf')
357
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
358
358
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
359
359
 
360
360
  config = Kreuzberg::Config::Extraction.new(
@@ -371,7 +371,7 @@ RSpec.describe 'Pages Extraction' do
371
371
  end
372
372
 
373
373
  it 'page content is not empty' do
374
- pdf_file = test_document_path('pdf/sample.pdf')
374
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
375
375
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
376
376
 
377
377
  config = Kreuzberg::Config::Extraction.new(
@@ -388,7 +388,7 @@ RSpec.describe 'Pages Extraction' do
388
388
 
389
389
  describe 'Combined Features' do
390
390
  it 'extract pages and insert markers together' do
391
- pdf_file = test_document_path('pdf/sample.pdf')
391
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
392
392
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
393
393
 
394
394
  config = Kreuzberg::Config::Extraction.new(
@@ -407,7 +407,7 @@ RSpec.describe 'Pages Extraction' do
407
407
  end
408
408
 
409
409
  it 'extract pages with custom marker format' do
410
- pdf_file = test_document_path('pdf/sample.pdf')
410
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
411
411
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
412
412
 
413
413
  config = Kreuzberg::Config::Extraction.new(
@@ -426,7 +426,7 @@ RSpec.describe 'Pages Extraction' do
426
426
  end
427
427
 
428
428
  it 'page extraction consistency between array and markers' do
429
- pdf_file = test_document_path('pdf/sample.pdf')
429
+ pdf_file = test_document_path('pdf/sample_contract.pdf')
430
430
  skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
431
431
 
432
432
  config = Kreuzberg::Config::Extraction.new(
@@ -246,7 +246,7 @@ RSpec.describe 'Table Extraction Quality' do
246
246
  config = Kreuzberg::Config::Extraction.new
247
247
 
248
248
  begin
249
- result = Kreuzberg.extract_file(path: test_document_path('office/document.docx'), config: config)
249
+ result = Kreuzberg.extract_file(path: test_document_path('docx/extraction_test.docx'), config: config)
250
250
  expect(result).not_to be_nil
251
251
  rescue Kreuzberg::Errors::ValidationError
252
252
  skip 'DOCX test file not available'
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.9"
6
+ version = "4.2.10"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.9"
3
+ version = "4.2.10"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -133,7 +133,7 @@ once_cell = { workspace = true }
133
133
  parking_lot = { workspace = true }
134
134
  pastey = "0.2"
135
135
  rayon = "1.11.0"
136
- regex = "1.12.2"
136
+ regex = "1.12.3"
137
137
  serde = { workspace = true }
138
138
  serde_json = { workspace = true }
139
139
  serde_yaml_ng = "0.10.0"
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.9 Release**
20
+ > **🚀 Version 4.2.10 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -5,14 +5,11 @@ use std::time::Instant;
5
5
  fn main() -> Result<(), Box<dyn std::error::Error>> {
6
6
  let test_pdfs = [
7
7
  (
8
- "a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
8
+ "a_comprehensive_stud_large_acomprehensives.pdf",
9
9
  "Academic Paper (18 fonts)",
10
10
  ),
11
- (
12
- "5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf",
13
- "Intel PDF (5 fonts)",
14
- ),
15
- ("fake_memo.pdf", "Tiny Memo (3-5 fonts)"),
11
+ ("5_level_paging_and_5_medium_5levelpagingand.pdf", "Intel PDF (5 fonts)"),
12
+ ("simple_small_fakememo.pdf", "Tiny Memo (3-5 fonts)"),
16
13
  ];
17
14
 
18
15
  let config = ExtractionConfig {
@@ -24,7 +21,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
24
21
  println!("Testing warm execution fix and font overhead fix\n");
25
22
 
26
23
  for (file, description) in &test_pdfs {
27
- let path = PathBuf::from(format!("test_documents/pdfs/{}", file));
24
+ let path = PathBuf::from(format!("test_documents/pdf/{}", file));
28
25
  println!("=== {} ===", description);
29
26
  println!("File: {}\n", file);
30
27
 
@@ -12,7 +12,7 @@ async fn main() {
12
12
 
13
13
  println!("Test 1: fake_memo.pdf");
14
14
  let start = Instant::now();
15
- match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
15
+ match extract_file("test_documents/pdf/fake_memo.pdf", None, &config).await {
16
16
  Ok(result) => {
17
17
  let duration = start.elapsed();
18
18
  println!(" ✓ Success! Duration: {:?}", duration);
@@ -26,7 +26,7 @@ async fn main() {
26
26
 
27
27
  println!("\nTest 2: Warm iteration");
28
28
  let start = Instant::now();
29
- match extract_file("test_documents/pdfs/fake_memo.pdf", None, &config).await {
29
+ match extract_file("test_documents/pdf/fake_memo.pdf", None, &config).await {
30
30
  Ok(result) => {
31
31
  let duration = start.elapsed();
32
32
  println!(" ✓ Success! Duration: {:?}", duration);
@@ -41,7 +41,7 @@ async fn main() {
41
41
  println!("\nTest 3: Academic Paper (18 fonts)");
42
42
  let start = Instant::now();
43
43
  match extract_file(
44
- "test_documents/pdfs/a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
44
+ "test_documents/pdf/a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
45
45
  None,
46
46
  &config,
47
47
  )