kreuzberg 4.2.9 → 4.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +13 -13
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/config.rb +2 -6
- data/lib/kreuzberg/version.rb +1 -1
- data/spec/binding/cache_spec.rb +2 -2
- data/spec/binding/cli_spec.rb +4 -4
- data/spec/binding/images_spec.rb +2 -2
- data/spec/binding/metadata_types_spec.rb +1 -1
- data/spec/binding/pages_extraction_spec.rb +26 -26
- data/spec/binding/tables_spec.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/examples/bench_fixes.rs +4 -7
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +3 -3
- data/vendor/kreuzberg/src/core/mime.rs +113 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +3 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +7 -7
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +4 -4
- data/vendor/kreuzberg/src/pdf/text.rs +1 -1
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +2 -2
- data/vendor/kreuzberg/tests/docx_mime_detection_test.rs +97 -0
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +2 -2
- data/vendor/kreuzberg/tests/format_integration.rs +2 -2
- data/vendor/kreuzberg/tests/image_integration.rs +4 -4
- data/vendor/kreuzberg/tests/issue_350_regression_test.rs +42 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +8 -8
- data/vendor/kreuzberg/tests/ocr_errors.rs +2 -2
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +4 -4
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pdfium_linking.rs +24 -27
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +3 -3
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +3 -3
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +3 -3
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: abf625c4f7eedb0ba24619d640ac572192a112bf29876c25c662c8faf8a7219c
|
|
4
|
+
data.tar.gz: 460cdf492f802db89332e989340070448c5b60bb44ce0860a1104889814bb9ac
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6e9b8b00347a73747e7ab8aad698f2d7a5798609dd1b086fe6df3a723c49bd05c5dff3c8ad0e7c83720cc3944b1a9d66fdec710405c9f1e22e43fe55387cdc92
|
|
7
|
+
data.tar.gz: dab907905f37a8fbc13d4c3e7e893cf6162fe57c6d735bfe08dfec32ea721706ab683d1baa4cfa6b0db4db8c393e7909ee3cf98195881ea31c8dc5ce0cda0b6a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.9)
|
|
4
|
+
kreuzberg (4.2.10)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -46,7 +46,7 @@ GEM
|
|
|
46
46
|
i18n (1.14.8)
|
|
47
47
|
concurrent-ruby (~> 1.0)
|
|
48
48
|
io-console (0.8.2)
|
|
49
|
-
json (2.18.
|
|
49
|
+
json (2.18.1)
|
|
50
50
|
language_server-protocol (3.17.0.5)
|
|
51
51
|
lint_roller (1.1.0)
|
|
52
52
|
listen (3.10.0)
|
|
@@ -75,12 +75,12 @@ GEM
|
|
|
75
75
|
rake (13.3.1)
|
|
76
76
|
rake-compiler (1.3.1)
|
|
77
77
|
rake
|
|
78
|
-
rake-compiler-dock (1.
|
|
78
|
+
rake-compiler-dock (1.11.0)
|
|
79
79
|
rb-fsevent (0.11.2)
|
|
80
80
|
rb-inotify (0.11.1)
|
|
81
81
|
ffi (~> 1.0)
|
|
82
|
-
rb_sys (0.9.
|
|
83
|
-
rake-compiler-dock (= 1.
|
|
82
|
+
rb_sys (0.9.124)
|
|
83
|
+
rake-compiler-dock (= 1.11.0)
|
|
84
84
|
rbs (3.10.3)
|
|
85
85
|
logger
|
|
86
86
|
tsort
|
|
@@ -100,7 +100,7 @@ GEM
|
|
|
100
100
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
101
101
|
rspec-support (~> 3.13.0)
|
|
102
102
|
rspec-support (3.13.7)
|
|
103
|
-
rubocop (1.84.
|
|
103
|
+
rubocop (1.84.1)
|
|
104
104
|
json (~> 2.3)
|
|
105
105
|
language_server-protocol (~> 3.17.0.2)
|
|
106
106
|
lint_roller (~> 1.1.0)
|
|
@@ -123,7 +123,7 @@ GEM
|
|
|
123
123
|
rubocop (~> 1.81)
|
|
124
124
|
ruby-progressbar (1.13.0)
|
|
125
125
|
securerandom (0.4.1)
|
|
126
|
-
sorbet-runtime (0.6.
|
|
126
|
+
sorbet-runtime (0.6.12914)
|
|
127
127
|
steep (1.10.0)
|
|
128
128
|
activesupport (>= 5.1)
|
|
129
129
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -208,8 +208,8 @@ CHECKSUMS
|
|
|
208
208
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
|
-
json (2.18.
|
|
212
|
-
kreuzberg (4.2.9)
|
|
211
|
+
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
+
kreuzberg (4.2.10)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -226,10 +226,10 @@ CHECKSUMS
|
|
|
226
226
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
227
227
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
228
228
|
rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
|
|
229
|
-
rake-compiler-dock (1.
|
|
229
|
+
rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
|
|
230
230
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
231
231
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
232
|
-
rb_sys (0.9.
|
|
232
|
+
rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
|
|
233
233
|
rbs (3.10.3) sha256=70627f3919016134d554e6c99195552ae3ef6020fe034c8e983facc9c192daa6
|
|
234
234
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
235
235
|
reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
|
|
@@ -238,13 +238,13 @@ CHECKSUMS
|
|
|
238
238
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
239
239
|
rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
|
|
240
240
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
241
|
-
rubocop (1.84.
|
|
241
|
+
rubocop (1.84.1) sha256=14cc626f355141f5a2ef53c10a68d66b13bb30639b26370a76559096cc6bcc1a
|
|
242
242
|
rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
|
|
243
243
|
rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
|
|
244
244
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
245
245
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
246
246
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
247
|
-
sorbet-runtime (0.6.
|
|
247
|
+
sorbet-runtime (0.6.12914) sha256=6d3c985d671dab9ab8ea244b51888b6e8e8e65e881e5bf816d1ac0950479dce6
|
|
248
248
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
249
249
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
250
250
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.9" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.10" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -71,7 +71,6 @@ module Kreuzberg
|
|
|
71
71
|
class Chunking
|
|
72
72
|
attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
|
|
73
73
|
|
|
74
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
|
75
74
|
def initialize(
|
|
76
75
|
max_chars: nil,
|
|
77
76
|
max_overlap: nil,
|
|
@@ -81,7 +80,6 @@ module Kreuzberg
|
|
|
81
80
|
chunk_overlap: nil,
|
|
82
81
|
enabled: true
|
|
83
82
|
)
|
|
84
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
|
85
83
|
resolved_size = chunk_size || max_chars || 1000
|
|
86
84
|
resolved_overlap = chunk_overlap || max_overlap || 200
|
|
87
85
|
|
|
@@ -867,7 +865,6 @@ module Kreuzberg
|
|
|
867
865
|
"Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
|
|
868
866
|
end
|
|
869
867
|
|
|
870
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
|
871
868
|
def to_h
|
|
872
869
|
{
|
|
873
870
|
use_cache: @use_cache,
|
|
@@ -888,7 +885,6 @@ module Kreuzberg
|
|
|
888
885
|
result_format: @result_format
|
|
889
886
|
}.compact
|
|
890
887
|
end
|
|
891
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
|
892
888
|
|
|
893
889
|
# Serialize configuration to JSON string
|
|
894
890
|
#
|
|
@@ -992,7 +988,7 @@ module Kreuzberg
|
|
|
992
988
|
# config[:use_cache] = false
|
|
993
989
|
# config[:force_ocr] = true
|
|
994
990
|
#
|
|
995
|
-
# rubocop:disable Metrics/
|
|
991
|
+
# rubocop:disable Metrics/MethodLength
|
|
996
992
|
def []=(key, value)
|
|
997
993
|
key_sym = key.to_sym
|
|
998
994
|
case key_sym
|
|
@@ -1032,7 +1028,7 @@ module Kreuzberg
|
|
|
1032
1028
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1033
1029
|
end
|
|
1034
1030
|
end
|
|
1035
|
-
# rubocop:enable Metrics/
|
|
1031
|
+
# rubocop:enable Metrics/MethodLength
|
|
1036
1032
|
|
|
1037
1033
|
# Get a configuration field using hash-like syntax
|
|
1038
1034
|
#
|
data/lib/kreuzberg/version.rb
CHANGED
data/spec/binding/cache_spec.rb
CHANGED
|
@@ -4,10 +4,10 @@ require 'spec_helper'
|
|
|
4
4
|
|
|
5
5
|
RSpec.describe 'Cache Management' do
|
|
6
6
|
let(:test_pdf) do
|
|
7
|
-
test_document_path('
|
|
7
|
+
test_document_path('pdf/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
|
|
8
8
|
end
|
|
9
9
|
let(:test_text) { test_document_path('text/contract_test.txt') }
|
|
10
|
-
let(:test_docx) { test_document_path('
|
|
10
|
+
let(:test_docx) { test_document_path('docx/extraction_test.docx') }
|
|
11
11
|
|
|
12
12
|
before do
|
|
13
13
|
Kreuzberg.clear_cache
|
data/spec/binding/cli_spec.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
RSpec.describe Kreuzberg::CLI do
|
|
4
4
|
describe '.extract' do
|
|
5
5
|
it 'extracts content from a file' do
|
|
6
|
-
path = test_document_path('
|
|
6
|
+
path = test_document_path('odt/simple.odt')
|
|
7
7
|
output = described_class.extract(path)
|
|
8
8
|
|
|
9
9
|
expect(output).to be_a(String)
|
|
@@ -11,7 +11,7 @@ RSpec.describe Kreuzberg::CLI do
|
|
|
11
11
|
end
|
|
12
12
|
|
|
13
13
|
it 'accepts output format option' do
|
|
14
|
-
path = test_document_path('
|
|
14
|
+
path = test_document_path('odt/simple.odt')
|
|
15
15
|
output = described_class.extract(path, output: 'json')
|
|
16
16
|
|
|
17
17
|
expect(output).to be_a(String)
|
|
@@ -19,7 +19,7 @@ RSpec.describe Kreuzberg::CLI do
|
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
it 'accepts OCR option' do
|
|
22
|
-
path = test_document_path('
|
|
22
|
+
path = test_document_path('pdf/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
|
|
23
23
|
output = described_class.extract(path, ocr: false)
|
|
24
24
|
|
|
25
25
|
expect(output).to be_a(String)
|
|
@@ -29,7 +29,7 @@ RSpec.describe Kreuzberg::CLI do
|
|
|
29
29
|
|
|
30
30
|
describe '.detect' do
|
|
31
31
|
it 'detects MIME type' do
|
|
32
|
-
path = test_document_path('
|
|
32
|
+
path = test_document_path('odt/simple.odt')
|
|
33
33
|
mime_type = described_class.detect(path)
|
|
34
34
|
|
|
35
35
|
expect(mime_type).to be_a(String)
|
data/spec/binding/images_spec.rb
CHANGED
|
@@ -108,7 +108,7 @@ RSpec.describe 'Image Extraction' do
|
|
|
108
108
|
)
|
|
109
109
|
|
|
110
110
|
begin
|
|
111
|
-
docx_path = test_document_path('
|
|
111
|
+
docx_path = test_document_path('docx/extraction_test.docx')
|
|
112
112
|
result = Kreuzberg.extract_file_sync(path: docx_path, config: config)
|
|
113
113
|
|
|
114
114
|
expect(result).not_to be_nil
|
|
@@ -126,7 +126,7 @@ RSpec.describe 'Image Extraction' do
|
|
|
126
126
|
)
|
|
127
127
|
|
|
128
128
|
begin
|
|
129
|
-
pptx_path = test_document_path('
|
|
129
|
+
pptx_path = test_document_path('pptx/simple.pptx')
|
|
130
130
|
result = Kreuzberg.extract_file_sync(path: pptx_path, config: config)
|
|
131
131
|
|
|
132
132
|
expect(result).not_to be_nil
|
|
@@ -959,7 +959,7 @@ RSpec.describe 'Kreuzberg Metadata Types' do
|
|
|
959
959
|
|
|
960
960
|
describe 'Integration Test: Extract actual HTML file' do
|
|
961
961
|
it 'extracts metadata from actual HTML file' do
|
|
962
|
-
html_file = test_document_path('
|
|
962
|
+
html_file = test_document_path('html/html.html')
|
|
963
963
|
|
|
964
964
|
expect(File.exist?(html_file)).to be(true)
|
|
965
965
|
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
RSpec.describe 'Pages Extraction' do
|
|
4
4
|
describe 'Extract Pages' do
|
|
5
5
|
it 'returns pages array when extractPages is true' do
|
|
6
|
-
pdf_file = test_document_path('pdf/
|
|
6
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
7
7
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
8
8
|
|
|
9
9
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -18,7 +18,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
it 'returns page numbers for each page' do
|
|
21
|
-
pdf_file = test_document_path('pdf/
|
|
21
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
22
22
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
23
23
|
|
|
24
24
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -34,7 +34,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
it 'returns page content for each page' do
|
|
37
|
-
pdf_file = test_document_path('pdf/
|
|
37
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
38
38
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
39
39
|
|
|
40
40
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -50,7 +50,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
50
50
|
end
|
|
51
51
|
|
|
52
52
|
it 'returns nil for pages when extractPages is false' do
|
|
53
|
-
pdf_file = test_document_path('pdf/
|
|
53
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
54
54
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
55
55
|
|
|
56
56
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -64,7 +64,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
64
64
|
end
|
|
65
65
|
|
|
66
66
|
it 'preserves page order' do
|
|
67
|
-
pdf_file = test_document_path('pdf/
|
|
67
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
68
68
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
69
69
|
|
|
70
70
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -83,7 +83,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
83
83
|
|
|
84
84
|
describe 'Insert Page Markers' do
|
|
85
85
|
it 'inserts page markers when insertPageMarkers is true' do
|
|
86
|
-
pdf_file = test_document_path('pdf/
|
|
86
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
87
87
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
88
88
|
|
|
89
89
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -98,7 +98,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
98
98
|
end
|
|
99
99
|
|
|
100
100
|
it 'does not insert markers when insertPageMarkers is false' do
|
|
101
|
-
pdf_file = test_document_path('pdf/
|
|
101
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
102
102
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
103
103
|
|
|
104
104
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -113,7 +113,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
113
113
|
end
|
|
114
114
|
|
|
115
115
|
it 'contains page numbers in markers' do
|
|
116
|
-
pdf_file = test_document_path('pdf/
|
|
116
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
117
117
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
118
118
|
|
|
119
119
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -128,7 +128,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
128
128
|
end
|
|
129
129
|
|
|
130
130
|
it 'inserts multiple markers for multi-page documents' do
|
|
131
|
-
pdf_file = test_document_path('pdf/
|
|
131
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
132
132
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
133
133
|
|
|
134
134
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -145,7 +145,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
145
145
|
|
|
146
146
|
describe 'Custom Marker Format' do
|
|
147
147
|
it 'uses custom marker format when specified' do
|
|
148
|
-
pdf_file = test_document_path('pdf/
|
|
148
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
149
149
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
150
150
|
|
|
151
151
|
custom_format = '=== PAGE {page_num} ==='
|
|
@@ -164,7 +164,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
164
164
|
end
|
|
165
165
|
|
|
166
166
|
it 'replaces page_num placeholder in custom format' do
|
|
167
|
-
pdf_file = test_document_path('pdf/
|
|
167
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
168
168
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
169
169
|
|
|
170
170
|
custom_format = '[Page Number: {page_num}]'
|
|
@@ -183,7 +183,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
183
183
|
end
|
|
184
184
|
|
|
185
185
|
it 'handles simple custom format' do
|
|
186
|
-
pdf_file = test_document_path('pdf/
|
|
186
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
187
187
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
188
188
|
|
|
189
189
|
custom_format = 'PAGE_{page_num}'
|
|
@@ -201,7 +201,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
201
201
|
end
|
|
202
202
|
|
|
203
203
|
it 'handles custom format with line separators' do
|
|
204
|
-
pdf_file = test_document_path('pdf/
|
|
204
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
205
205
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
206
206
|
|
|
207
207
|
custom_format = "\n---PAGE {page_num}---\n"
|
|
@@ -219,7 +219,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
219
219
|
end
|
|
220
220
|
|
|
221
221
|
it 'overrides default marker format' do
|
|
222
|
-
pdf_file = test_document_path('pdf/
|
|
222
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
223
223
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
224
224
|
|
|
225
225
|
custom_format = 'CUSTOM_PAGE_{page_num}'
|
|
@@ -239,7 +239,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
239
239
|
|
|
240
240
|
describe 'Multi-Page PDF' do
|
|
241
241
|
it 'produces multiple pages from multi-page PDF' do
|
|
242
|
-
pdf_file = test_document_path('pdf/
|
|
242
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
243
243
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
244
244
|
|
|
245
245
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -253,7 +253,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
253
253
|
end
|
|
254
254
|
|
|
255
255
|
it 'page numbers are sequential' do
|
|
256
|
-
pdf_file = test_document_path('pdf/
|
|
256
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
257
257
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
258
258
|
|
|
259
259
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -269,7 +269,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
269
269
|
end
|
|
270
270
|
|
|
271
271
|
it 'each page has content' do
|
|
272
|
-
pdf_file = test_document_path('pdf/
|
|
272
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
273
273
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
274
274
|
|
|
275
275
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -286,7 +286,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
286
286
|
end
|
|
287
287
|
|
|
288
288
|
it 'with markers contains all pages' do
|
|
289
|
-
pdf_file = test_document_path('pdf/
|
|
289
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
290
290
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
291
291
|
|
|
292
292
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -303,7 +303,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
303
303
|
|
|
304
304
|
describe 'Page Content Structure Validation' do
|
|
305
305
|
it 'validates page structure' do
|
|
306
|
-
pdf_file = test_document_path('pdf/
|
|
306
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
307
307
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
308
308
|
|
|
309
309
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -320,7 +320,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
320
320
|
end
|
|
321
321
|
|
|
322
322
|
it 'page content has required fields' do
|
|
323
|
-
pdf_file = test_document_path('pdf/
|
|
323
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
324
324
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
325
325
|
|
|
326
326
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -337,7 +337,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
337
337
|
end
|
|
338
338
|
|
|
339
339
|
it 'page content with tables preserves table data' do
|
|
340
|
-
pdf_file = test_document_path('pdf/
|
|
340
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
341
341
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
342
342
|
|
|
343
343
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -354,7 +354,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
354
354
|
end
|
|
355
355
|
|
|
356
356
|
it 'page content with images preserves image data' do
|
|
357
|
-
pdf_file = test_document_path('pdf/
|
|
357
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
358
358
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
359
359
|
|
|
360
360
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -371,7 +371,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
371
371
|
end
|
|
372
372
|
|
|
373
373
|
it 'page content is not empty' do
|
|
374
|
-
pdf_file = test_document_path('pdf/
|
|
374
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
375
375
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
376
376
|
|
|
377
377
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -388,7 +388,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
388
388
|
|
|
389
389
|
describe 'Combined Features' do
|
|
390
390
|
it 'extract pages and insert markers together' do
|
|
391
|
-
pdf_file = test_document_path('pdf/
|
|
391
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
392
392
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
393
393
|
|
|
394
394
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -407,7 +407,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
407
407
|
end
|
|
408
408
|
|
|
409
409
|
it 'extract pages with custom marker format' do
|
|
410
|
-
pdf_file = test_document_path('pdf/
|
|
410
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
411
411
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
412
412
|
|
|
413
413
|
config = Kreuzberg::Config::Extraction.new(
|
|
@@ -426,7 +426,7 @@ RSpec.describe 'Pages Extraction' do
|
|
|
426
426
|
end
|
|
427
427
|
|
|
428
428
|
it 'page extraction consistency between array and markers' do
|
|
429
|
-
pdf_file = test_document_path('pdf/
|
|
429
|
+
pdf_file = test_document_path('pdf/sample_contract.pdf')
|
|
430
430
|
skip "Test PDF not available at #{pdf_file}" unless File.exist?(pdf_file)
|
|
431
431
|
|
|
432
432
|
config = Kreuzberg::Config::Extraction.new(
|
data/spec/binding/tables_spec.rb
CHANGED
|
@@ -246,7 +246,7 @@ RSpec.describe 'Table Extraction Quality' do
|
|
|
246
246
|
config = Kreuzberg::Config::Extraction.new
|
|
247
247
|
|
|
248
248
|
begin
|
|
249
|
-
result = Kreuzberg.extract_file(path: test_document_path('
|
|
249
|
+
result = Kreuzberg.extract_file(path: test_document_path('docx/extraction_test.docx'), config: config)
|
|
250
250
|
expect(result).not_to be_nil
|
|
251
251
|
rescue Kreuzberg::Errors::ValidationError
|
|
252
252
|
skip 'DOCX test file not available'
|
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.9"
|
|
3
|
+
version = "4.2.10"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -133,7 +133,7 @@ once_cell = { workspace = true }
|
|
|
133
133
|
parking_lot = { workspace = true }
|
|
134
134
|
pastey = "0.2"
|
|
135
135
|
rayon = "1.11.0"
|
|
136
|
-
regex = "1.12.
|
|
136
|
+
regex = "1.12.3"
|
|
137
137
|
serde = { workspace = true }
|
|
138
138
|
serde_json = { workspace = true }
|
|
139
139
|
serde_yaml_ng = "0.10.0"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.9 Release**
|
|
20
|
+
> **🚀 Version 4.2.10 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -5,14 +5,11 @@ use std::time::Instant;
|
|
|
5
5
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
6
6
|
let test_pdfs = [
|
|
7
7
|
(
|
|
8
|
-
"
|
|
8
|
+
"a_comprehensive_stud_large_acomprehensives.pdf",
|
|
9
9
|
"Academic Paper (18 fonts)",
|
|
10
10
|
),
|
|
11
|
-
(
|
|
12
|
-
|
|
13
|
-
"Intel PDF (5 fonts)",
|
|
14
|
-
),
|
|
15
|
-
("fake_memo.pdf", "Tiny Memo (3-5 fonts)"),
|
|
11
|
+
("5_level_paging_and_5_medium_5levelpagingand.pdf", "Intel PDF (5 fonts)"),
|
|
12
|
+
("simple_small_fakememo.pdf", "Tiny Memo (3-5 fonts)"),
|
|
16
13
|
];
|
|
17
14
|
|
|
18
15
|
let config = ExtractionConfig {
|
|
@@ -24,7 +21,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
24
21
|
println!("Testing warm execution fix and font overhead fix\n");
|
|
25
22
|
|
|
26
23
|
for (file, description) in &test_pdfs {
|
|
27
|
-
let path = PathBuf::from(format!("test_documents/
|
|
24
|
+
let path = PathBuf::from(format!("test_documents/pdf/{}", file));
|
|
28
25
|
println!("=== {} ===", description);
|
|
29
26
|
println!("File: {}\n", file);
|
|
30
27
|
|
|
@@ -12,7 +12,7 @@ async fn main() {
|
|
|
12
12
|
|
|
13
13
|
println!("Test 1: fake_memo.pdf");
|
|
14
14
|
let start = Instant::now();
|
|
15
|
-
match extract_file("test_documents/
|
|
15
|
+
match extract_file("test_documents/pdf/fake_memo.pdf", None, &config).await {
|
|
16
16
|
Ok(result) => {
|
|
17
17
|
let duration = start.elapsed();
|
|
18
18
|
println!(" ✓ Success! Duration: {:?}", duration);
|
|
@@ -26,7 +26,7 @@ async fn main() {
|
|
|
26
26
|
|
|
27
27
|
println!("\nTest 2: Warm iteration");
|
|
28
28
|
let start = Instant::now();
|
|
29
|
-
match extract_file("test_documents/
|
|
29
|
+
match extract_file("test_documents/pdf/fake_memo.pdf", None, &config).await {
|
|
30
30
|
Ok(result) => {
|
|
31
31
|
let duration = start.elapsed();
|
|
32
32
|
println!(" ✓ Success! Duration: {:?}", duration);
|
|
@@ -41,7 +41,7 @@ async fn main() {
|
|
|
41
41
|
println!("\nTest 3: Academic Paper (18 fonts)");
|
|
42
42
|
let start = Instant::now();
|
|
43
43
|
match extract_file(
|
|
44
|
-
"test_documents/
|
|
44
|
+
"test_documents/pdf/a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
|
|
45
45
|
None,
|
|
46
46
|
&config,
|
|
47
47
|
)
|