kreuzberg 4.3.4 → 4.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +5 -5
- data/ext/kreuzberg_rb/native/src/result.rs +40 -0
- data/lib/kreuzberg/result.rb +44 -20
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +9 -0
- data/sig/kreuzberg.rbs +7 -2
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/router.rs +2 -2
- data/vendor/kreuzberg/src/chunking/core.rs +2 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +2 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +17 -6
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +30 -3
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -0
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +1 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +2 -0
- data/vendor/kreuzberg/src/extractors/csv.rs +1 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +2 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +15 -8
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +1 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +2 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +185 -10
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +84 -12
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +1 -0
- data/vendor/kreuzberg/src/mcp/format.rs +2 -1
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +2 -1
- data/vendor/kreuzberg/src/paddle_ocr/config.rs +26 -6
- data/vendor/kreuzberg/src/pdf/fonts.rs +3 -1
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +78 -9
- data/vendor/kreuzberg/src/pdf/hierarchy/extraction.rs +57 -7
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +1 -1
- data/vendor/kreuzberg/src/pdf/markdown.rs +2014 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +6 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +6 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +1 -0
- data/vendor/kreuzberg/src/types/extraction.rs +6 -0
- data/vendor/kreuzberg/src/types/mod.rs +167 -0
- data/vendor/kreuzberg/src/types/tables.rs +121 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +1 -0
- data/vendor/kreuzberg/tests/dump_pdf_markdown.rs +83 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +3 -0
- data/vendor/kreuzberg/tests/pdf_markdown_all_docs.rs +282 -0
- data/vendor/kreuzberg/tests/pdf_markdown_extraction.rs +108 -0
- data/vendor/kreuzberg/tests/pdf_table_detection.rs +285 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +9 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +4 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +1 -0
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/src/crnn_net.rs +58 -0
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text/char.rs +15 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +7 -4
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/lib/libpdfium.so +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3936788c6812a84428d0467330f573c20c9b569c399eab105cc2815d777b2141
|
|
4
|
+
data.tar.gz: 1b115f87bc4a40960584de9459725d0afabfefe6244ddea434461a2c6f36a647
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 73fb7522dcd091b449d5146e65f008bded545a6c49a35d77468bd9bfa61d30a7413dc7364eef6ce79cca4606c5c710ea3f574509743569a7fb4f8a7bc579f402
|
|
7
|
+
data.tar.gz: e21fb401768da5005a1edb720b0c00c82aaa9a8ef60b6f2bab3587b3c8c94cd8fdcc220daeb8545bae6523bf8be22dbc1dab7c966ae8f30e888342d96c7e5df2
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.3.4)
|
|
4
|
+
kreuzberg (4.3.5)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -59,7 +59,7 @@ GEM
|
|
|
59
59
|
prism (~> 1.5)
|
|
60
60
|
mutex_m (0.3.0)
|
|
61
61
|
parallel (1.27.0)
|
|
62
|
-
parser (3.3.10.
|
|
62
|
+
parser (3.3.10.2)
|
|
63
63
|
ast (~> 2.4.1)
|
|
64
64
|
racc
|
|
65
65
|
prism (1.9.0)
|
|
@@ -210,7 +210,7 @@ CHECKSUMS
|
|
|
210
210
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
211
211
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
212
212
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
213
|
-
kreuzberg (4.3.4)
|
|
213
|
+
kreuzberg (4.3.5)
|
|
214
214
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
215
215
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
216
216
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -219,7 +219,7 @@ CHECKSUMS
|
|
|
219
219
|
minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
|
|
220
220
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
221
221
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
222
|
-
parser (3.3.10.
|
|
222
|
+
parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
|
|
223
223
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
224
224
|
pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
|
|
225
225
|
pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.5" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -100,19 +100,19 @@ pub fn ruby_cache_stats() -> Result<RHash, Error> {
|
|
|
100
100
|
|
|
101
101
|
// Validation wrapper functions
|
|
102
102
|
pub fn validate_binarization_method(method: String) -> Result<i32, Error> {
|
|
103
|
-
unsafe { Ok(kreuzberg_validate_binarization_method(method.as_ptr() as *const
|
|
103
|
+
unsafe { Ok(kreuzberg_validate_binarization_method(method.as_ptr() as *const std::os::raw::c_char)) }
|
|
104
104
|
}
|
|
105
105
|
|
|
106
106
|
pub fn validate_ocr_backend(backend: String) -> Result<i32, Error> {
|
|
107
|
-
unsafe { Ok(kreuzberg_validate_ocr_backend(backend.as_ptr() as *const
|
|
107
|
+
unsafe { Ok(kreuzberg_validate_ocr_backend(backend.as_ptr() as *const std::os::raw::c_char)) }
|
|
108
108
|
}
|
|
109
109
|
|
|
110
110
|
pub fn validate_language_code(code: String) -> Result<i32, Error> {
|
|
111
|
-
unsafe { Ok(kreuzberg_validate_language_code(code.as_ptr() as *const
|
|
111
|
+
unsafe { Ok(kreuzberg_validate_language_code(code.as_ptr() as *const std::os::raw::c_char)) }
|
|
112
112
|
}
|
|
113
113
|
|
|
114
114
|
pub fn validate_token_reduction_level(level: String) -> Result<i32, Error> {
|
|
115
|
-
unsafe { Ok(kreuzberg_validate_token_reduction_level(level.as_ptr() as *const
|
|
115
|
+
unsafe { Ok(kreuzberg_validate_token_reduction_level(level.as_ptr() as *const std::os::raw::c_char)) }
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
pub fn validate_tesseract_psm(psm: i32) -> Result<i32, Error> {
|
|
@@ -124,7 +124,7 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<i32, Error> {
|
|
|
124
124
|
}
|
|
125
125
|
|
|
126
126
|
pub fn validate_output_format(format: String) -> Result<i32, Error> {
|
|
127
|
-
unsafe { Ok(kreuzberg_validate_output_format(format.as_ptr() as *const
|
|
127
|
+
unsafe { Ok(kreuzberg_validate_output_format(format.as_ptr() as *const std::os::raw::c_char)) }
|
|
128
128
|
}
|
|
129
129
|
|
|
130
130
|
pub fn validate_confidence(confidence: f64) -> Result<i32, Error> {
|
|
@@ -53,6 +53,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
53
53
|
table_hash.aset("cells", cells_array)?;
|
|
54
54
|
table_hash.aset("markdown", table.markdown)?;
|
|
55
55
|
table_hash.aset("page_number", table.page_number)?;
|
|
56
|
+
if let Some(bbox) = table.bounding_box {
|
|
57
|
+
let bbox_hash = ruby.hash_new();
|
|
58
|
+
bbox_hash.aset("x0", bbox.x0)?;
|
|
59
|
+
bbox_hash.aset("y0", bbox.y0)?;
|
|
60
|
+
bbox_hash.aset("x1", bbox.x1)?;
|
|
61
|
+
bbox_hash.aset("y1", bbox.y1)?;
|
|
62
|
+
table_hash.aset("bounding_box", bbox_hash)?;
|
|
63
|
+
} else {
|
|
64
|
+
table_hash.aset("bounding_box", ruby.qnil().as_value())?;
|
|
65
|
+
}
|
|
56
66
|
|
|
57
67
|
tables_array.push(table_hash)?;
|
|
58
68
|
}
|
|
@@ -164,6 +174,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
164
174
|
} else {
|
|
165
175
|
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
166
176
|
}
|
|
177
|
+
if let Some(bbox) = image.bounding_box {
|
|
178
|
+
let bbox_hash = ruby.hash_new();
|
|
179
|
+
bbox_hash.aset("x0", bbox.x0)?;
|
|
180
|
+
bbox_hash.aset("y0", bbox.y0)?;
|
|
181
|
+
bbox_hash.aset("x1", bbox.x1)?;
|
|
182
|
+
bbox_hash.aset("y1", bbox.y1)?;
|
|
183
|
+
image_hash.aset("bounding_box", bbox_hash)?;
|
|
184
|
+
} else {
|
|
185
|
+
image_hash.aset("bounding_box", ruby.qnil().as_value())?;
|
|
186
|
+
}
|
|
167
187
|
images_array.push(image_hash)?;
|
|
168
188
|
}
|
|
169
189
|
set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
|
|
@@ -191,6 +211,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
191
211
|
table_hash.aset("cells", cells_array)?;
|
|
192
212
|
table_hash.aset("markdown", table.markdown.clone())?;
|
|
193
213
|
table_hash.aset("page_number", table.page_number as i64)?;
|
|
214
|
+
if let Some(ref bbox) = table.bounding_box {
|
|
215
|
+
let bbox_hash = ruby.hash_new();
|
|
216
|
+
bbox_hash.aset("x0", bbox.x0)?;
|
|
217
|
+
bbox_hash.aset("y0", bbox.y0)?;
|
|
218
|
+
bbox_hash.aset("x1", bbox.x1)?;
|
|
219
|
+
bbox_hash.aset("y1", bbox.y1)?;
|
|
220
|
+
table_hash.aset("bounding_box", bbox_hash)?;
|
|
221
|
+
} else {
|
|
222
|
+
table_hash.aset("bounding_box", ruby.qnil().as_value())?;
|
|
223
|
+
}
|
|
194
224
|
|
|
195
225
|
tables_array.push(table_hash)?;
|
|
196
226
|
}
|
|
@@ -248,6 +278,16 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
248
278
|
} else {
|
|
249
279
|
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
250
280
|
}
|
|
281
|
+
if let Some(ref bbox) = image.bounding_box {
|
|
282
|
+
let bbox_hash = ruby.hash_new();
|
|
283
|
+
bbox_hash.aset("x0", bbox.x0)?;
|
|
284
|
+
bbox_hash.aset("y0", bbox.y0)?;
|
|
285
|
+
bbox_hash.aset("x1", bbox.x1)?;
|
|
286
|
+
bbox_hash.aset("y1", bbox.y1)?;
|
|
287
|
+
image_hash.aset("bounding_box", bbox_hash)?;
|
|
288
|
+
} else {
|
|
289
|
+
image_hash.aset("bounding_box", ruby.qnil().as_value())?;
|
|
290
|
+
}
|
|
251
291
|
images_array.push(image_hash)?;
|
|
252
292
|
}
|
|
253
293
|
page_hash.aset("images", images_array)?;
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -22,9 +22,11 @@ module Kreuzberg
|
|
|
22
22
|
# @return [String] Markdown representation
|
|
23
23
|
# @!attribute [r] page_number
|
|
24
24
|
# @return [Integer] Page number where table was found
|
|
25
|
-
|
|
25
|
+
# @!attribute [r] bounding_box
|
|
26
|
+
# @return [BoundingBox, nil] Bounding box of the table on the page
|
|
27
|
+
Table = Struct.new(:cells, :markdown, :page_number, :bounding_box, keyword_init: true) do
|
|
26
28
|
def to_h
|
|
27
|
-
{ cells: cells, markdown: markdown, page_number: page_number }
|
|
29
|
+
{ cells: cells, markdown: markdown, page_number: page_number, bounding_box: bounding_box&.to_h }
|
|
28
30
|
end
|
|
29
31
|
end
|
|
30
32
|
|
|
@@ -78,6 +80,7 @@ module Kreuzberg
|
|
|
78
80
|
:bits_per_component,
|
|
79
81
|
:is_mask,
|
|
80
82
|
:description,
|
|
83
|
+
:bounding_box,
|
|
81
84
|
:ocr_result,
|
|
82
85
|
keyword_init: true
|
|
83
86
|
) do
|
|
@@ -93,6 +96,7 @@ module Kreuzberg
|
|
|
93
96
|
bits_per_component: bits_per_component,
|
|
94
97
|
is_mask: is_mask,
|
|
95
98
|
description: description,
|
|
99
|
+
bounding_box: bounding_box&.to_h,
|
|
96
100
|
ocr_result: ocr_result&.to_h
|
|
97
101
|
}
|
|
98
102
|
end
|
|
@@ -486,10 +490,12 @@ module Kreuzberg
|
|
|
486
490
|
return [] if tables_data.nil? || tables_data.empty?
|
|
487
491
|
|
|
488
492
|
tables_data.map do |table_hash|
|
|
493
|
+
bounding_box = parse_bounding_box(table_hash['bounding_box'])
|
|
489
494
|
Table.new(
|
|
490
495
|
cells: table_hash['cells'] || [],
|
|
491
496
|
markdown: table_hash['markdown'] || '',
|
|
492
|
-
page_number: table_hash['page_number'] || 0
|
|
497
|
+
page_number: table_hash['page_number'] || 0,
|
|
498
|
+
bounding_box: bounding_box
|
|
493
499
|
)
|
|
494
500
|
end
|
|
495
501
|
end
|
|
@@ -521,23 +527,26 @@ module Kreuzberg
|
|
|
521
527
|
def parse_images(images_data)
|
|
522
528
|
return nil if images_data.nil?
|
|
523
529
|
|
|
524
|
-
images_data.map
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
530
|
+
images_data.map { |image_hash| parse_single_image(image_hash) }
|
|
531
|
+
end
|
|
532
|
+
|
|
533
|
+
def parse_single_image(image_hash)
|
|
534
|
+
data = image_hash['data']
|
|
535
|
+
data = data.dup.force_encoding(Encoding::BINARY) if data.respond_to?(:force_encoding)
|
|
536
|
+
Image.new(
|
|
537
|
+
data: data,
|
|
538
|
+
format: image_hash['format'],
|
|
539
|
+
image_index: image_hash['image_index'],
|
|
540
|
+
page_number: image_hash['page_number'],
|
|
541
|
+
width: image_hash['width'],
|
|
542
|
+
height: image_hash['height'],
|
|
543
|
+
colorspace: image_hash['colorspace'],
|
|
544
|
+
bits_per_component: image_hash['bits_per_component'],
|
|
545
|
+
is_mask: image_hash['is_mask'],
|
|
546
|
+
description: image_hash['description'],
|
|
547
|
+
bounding_box: parse_bounding_box(image_hash['bounding_box']),
|
|
548
|
+
ocr_result: image_hash['ocr_result'] ? Result.new(image_hash['ocr_result']) : nil
|
|
549
|
+
)
|
|
541
550
|
end
|
|
542
551
|
|
|
543
552
|
def parse_pages(pages_data)
|
|
@@ -610,6 +619,21 @@ module Kreuzberg
|
|
|
610
619
|
)
|
|
611
620
|
end
|
|
612
621
|
|
|
622
|
+
def parse_bounding_box(bounding_box_data)
|
|
623
|
+
return nil if bounding_box_data.nil?
|
|
624
|
+
|
|
625
|
+
# If it's already a BoundingBox object, return it
|
|
626
|
+
return bounding_box_data if bounding_box_data.is_a?(BoundingBox)
|
|
627
|
+
|
|
628
|
+
# Otherwise parse from hash
|
|
629
|
+
BoundingBox.new(
|
|
630
|
+
x0: bounding_box_data['x0'].to_f,
|
|
631
|
+
y0: bounding_box_data['y0'].to_f,
|
|
632
|
+
x1: bounding_box_data['x1'].to_f,
|
|
633
|
+
y1: bounding_box_data['y1'].to_f
|
|
634
|
+
)
|
|
635
|
+
end
|
|
636
|
+
|
|
613
637
|
def parse_ocr_elements(ocr_elements_data)
|
|
614
638
|
return nil if ocr_elements_data.nil?
|
|
615
639
|
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -21,11 +21,20 @@ module Kreuzberg
|
|
|
21
21
|
autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
|
|
22
22
|
autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
|
|
23
23
|
|
|
24
|
+
autoload :BoundingBox, 'kreuzberg/types'
|
|
25
|
+
autoload :ElementMetadata, 'kreuzberg/types'
|
|
26
|
+
autoload :Element, 'kreuzberg/types'
|
|
24
27
|
autoload :HtmlMetadata, 'kreuzberg/types'
|
|
25
28
|
autoload :HeaderMetadata, 'kreuzberg/types'
|
|
26
29
|
autoload :LinkMetadata, 'kreuzberg/types'
|
|
27
30
|
autoload :ImageMetadata, 'kreuzberg/types'
|
|
28
31
|
autoload :StructuredData, 'kreuzberg/types'
|
|
32
|
+
autoload :ExtractedKeyword, 'kreuzberg/types'
|
|
33
|
+
autoload :ProcessingWarning, 'kreuzberg/types'
|
|
34
|
+
autoload :DocumentBoundingBox, 'kreuzberg/types'
|
|
35
|
+
autoload :DocumentAnnotation, 'kreuzberg/types'
|
|
36
|
+
autoload :DocumentNode, 'kreuzberg/types'
|
|
37
|
+
autoload :DocumentStructure, 'kreuzberg/types'
|
|
29
38
|
|
|
30
39
|
ExtractionConfig = Config::Extraction
|
|
31
40
|
PageConfig = Config::PageConfig
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -677,7 +677,8 @@ module Kreuzberg
|
|
|
677
677
|
type table_hash = {
|
|
678
678
|
cells: Array[Array[String]],
|
|
679
679
|
markdown: String,
|
|
680
|
-
page_number: Integer
|
|
680
|
+
page_number: Integer,
|
|
681
|
+
bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
|
|
681
682
|
}
|
|
682
683
|
|
|
683
684
|
type chunk_hash = {
|
|
@@ -703,6 +704,7 @@ module Kreuzberg
|
|
|
703
704
|
bits_per_component: Integer?,
|
|
704
705
|
is_mask: bool,
|
|
705
706
|
description: String?,
|
|
707
|
+
bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?,
|
|
706
708
|
ocr_result: extraction_result_hash?
|
|
707
709
|
}
|
|
708
710
|
|
|
@@ -746,8 +748,9 @@ module Kreuzberg
|
|
|
746
748
|
attr_reader cells: Array[Array[String]]
|
|
747
749
|
attr_reader markdown: String
|
|
748
750
|
attr_reader page_number: Integer
|
|
751
|
+
attr_reader bounding_box: BoundingBox?
|
|
749
752
|
|
|
750
|
-
def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer) -> void
|
|
753
|
+
def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer, bounding_box: BoundingBox?) -> void
|
|
751
754
|
def to_h: () -> table_hash
|
|
752
755
|
end
|
|
753
756
|
|
|
@@ -789,6 +792,7 @@ module Kreuzberg
|
|
|
789
792
|
attr_reader bits_per_component: Integer?
|
|
790
793
|
attr_reader is_mask: bool
|
|
791
794
|
attr_reader description: String?
|
|
795
|
+
attr_reader bounding_box: BoundingBox?
|
|
792
796
|
attr_reader ocr_result: Result?
|
|
793
797
|
|
|
794
798
|
def initialize: (
|
|
@@ -802,6 +806,7 @@ module Kreuzberg
|
|
|
802
806
|
bits_per_component: Integer?,
|
|
803
807
|
is_mask: bool,
|
|
804
808
|
description: String?,
|
|
809
|
+
bounding_box: BoundingBox?,
|
|
805
810
|
ocr_result: Result?
|
|
806
811
|
) -> void
|
|
807
812
|
def to_h: () -> image_hash
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.3.4"
|
|
5
|
+
version = "4.3.5"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -21,7 +21,7 @@ console_error_panic_hook = "0.1"
|
|
|
21
21
|
criterion = { version = "0.8", features = ["html_reports"] }
|
|
22
22
|
getrandom = { version = "0.4.1", features = ["wasm_js"] }
|
|
23
23
|
hex = "0.4.3"
|
|
24
|
-
html-to-markdown-rs = { version = "2.25.
|
|
24
|
+
html-to-markdown-rs = { version = "2.25.1", default-features = false }
|
|
25
25
|
image = { version = "0.25.9", default-features = false }
|
|
26
26
|
js-sys = "0.3"
|
|
27
27
|
libc = "0.2.182"
|
|
@@ -37,7 +37,7 @@ serde_json = { version = "1.0.149" }
|
|
|
37
37
|
tempfile = "3.25.0"
|
|
38
38
|
thiserror = "2.0.18"
|
|
39
39
|
tokio = { version = "1.49.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
40
|
-
toml = "1.0.
|
|
40
|
+
toml = "1.0.2"
|
|
41
41
|
tracing = "0.1"
|
|
42
42
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
43
43
|
wasm-bindgen-futures = "0.4"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.3.4"
|
|
3
|
+
version = "4.3.5"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -154,7 +154,7 @@ serde = { version = "1.0.228", features = ["derive"] }
|
|
|
154
154
|
serde_json = { version = "1.0.149" }
|
|
155
155
|
serde_yaml_ng = "0.10.0"
|
|
156
156
|
jotdown = "0.9"
|
|
157
|
-
toml = "1.0.
|
|
157
|
+
toml = "1.0.2"
|
|
158
158
|
mime_guess = "2.0"
|
|
159
159
|
rmp-serde = "1.3"
|
|
160
160
|
thiserror = "2.0.18"
|
|
@@ -167,11 +167,11 @@ lopdf = { version = "0.39.0", optional = true }
|
|
|
167
167
|
calamine = { version = "0.33.0", features = ["dates"], optional = true }
|
|
168
168
|
polars = { version = "0.53.0", default-features = false, features = ["ipc"], optional = true }
|
|
169
169
|
roxmltree = { version = "0.21.1", optional = true }
|
|
170
|
-
zip = { version = "8.
|
|
170
|
+
zip = { version = "8.1.0", optional = true, default-features = false, features = [
|
|
171
171
|
"deflate-flate2",
|
|
172
172
|
] }
|
|
173
173
|
mail-parser = { version = "0.11.2", optional = true }
|
|
174
|
-
html-to-markdown-rs = { version = "2.25.
|
|
174
|
+
html-to-markdown-rs = { version = "2.25.1", default-features = false , features = [
|
|
175
175
|
"inline-images", "metadata", ], optional = true }
|
|
176
176
|
cfb = { version = "0.14.0", optional = true }
|
|
177
177
|
quick-xml = { version = "0.39.1", features = ["serialize"], optional = true }
|
|
@@ -236,7 +236,7 @@ sha2 = { version = "0.10", optional = true }
|
|
|
236
236
|
tempfile = "3.25.0"
|
|
237
237
|
filetime = "0.2"
|
|
238
238
|
tar = "0.4.44"
|
|
239
|
-
zip = { version = "8.
|
|
239
|
+
zip = { version = "8.1.0", default-features = false, features = ["deflate-flate2"] }
|
|
240
240
|
serial_test = "3.3.1"
|
|
241
241
|
anyhow = "1.0"
|
|
242
242
|
tokio-test = "0.4"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.3.4 Release**
|
|
20
|
+
> **🚀 Version 4.3.5 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -99,7 +99,7 @@ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits
|
|
|
99
99
|
/// # Examples
|
|
100
100
|
///
|
|
101
101
|
/// ```no_run
|
|
102
|
-
/// use kreuzberg::{ExtractionConfig, api::
|
|
102
|
+
/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits_and_server_config, ApiSizeLimits}, core::ServerConfig};
|
|
103
103
|
///
|
|
104
104
|
/// # #[tokio::main]
|
|
105
105
|
/// # async fn main() -> kreuzberg::Result<()> {
|
|
@@ -108,7 +108,7 @@ pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits
|
|
|
108
108
|
/// server_config.cors_origins = vec!["https://example.com".to_string()];
|
|
109
109
|
/// let router = create_router_with_limits_and_server_config(
|
|
110
110
|
/// extraction_config,
|
|
111
|
-
///
|
|
111
|
+
/// ApiSizeLimits::default(),
|
|
112
112
|
/// server_config
|
|
113
113
|
/// );
|
|
114
114
|
/// # Ok(())
|
|
@@ -37,6 +37,8 @@ use super::validation::validate_utf8_boundaries;
|
|
|
37
37
|
/// overlap: 50,
|
|
38
38
|
/// trim: true,
|
|
39
39
|
/// chunker_type: ChunkerType::Text,
|
|
40
|
+
/// embedding: None,
|
|
41
|
+
/// preset: None,
|
|
40
42
|
/// };
|
|
41
43
|
/// let result = chunk_text("Long text...", &config, None)?;
|
|
42
44
|
/// assert!(!result.chunks.is_empty());
|
|
@@ -23,10 +23,12 @@ use std::borrow::Cow;
|
|
|
23
23
|
/// * `result` - The extraction result to modify
|
|
24
24
|
/// * `output_format` - The desired output format
|
|
25
25
|
pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputFormat) {
|
|
26
|
-
// Check if content was already formatted during extraction
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
26
|
+
// Check if content was already formatted during extraction.
|
|
27
|
+
// Since extractors now preserve original MIME types, detect by checking
|
|
28
|
+
// metadata.output_format which is set by extractors that pre-format.
|
|
29
|
+
let already_formatted = match result.metadata.output_format.as_deref() {
|
|
30
|
+
Some("markdown") if output_format == OutputFormat::Markdown => true,
|
|
31
|
+
Some("djot") if output_format == OutputFormat::Djot => true,
|
|
30
32
|
_ => false,
|
|
31
33
|
};
|
|
32
34
|
|
|
@@ -195,7 +197,11 @@ mod tests {
|
|
|
195
197
|
|
|
196
198
|
let mut result = ExtractionResult {
|
|
197
199
|
content: "Hello World".to_string(),
|
|
198
|
-
mime_type: Cow::Borrowed("text/
|
|
200
|
+
mime_type: Cow::Borrowed("text/html"),
|
|
201
|
+
metadata: Metadata {
|
|
202
|
+
output_format: Some("djot".to_string()),
|
|
203
|
+
..Default::default()
|
|
204
|
+
},
|
|
199
205
|
djot_content: Some(DjotContent {
|
|
200
206
|
plain_text: "Hello World".to_string(),
|
|
201
207
|
blocks: vec![FormattedBlock {
|
|
@@ -322,6 +328,7 @@ mod tests {
|
|
|
322
328
|
cells: vec![vec!["A".to_string(), "B".to_string()]],
|
|
323
329
|
markdown: "| A | B |".to_string(),
|
|
324
330
|
page_number: 1,
|
|
331
|
+
bounding_box: None,
|
|
325
332
|
};
|
|
326
333
|
|
|
327
334
|
let mut result = ExtractionResult {
|
|
@@ -367,7 +374,11 @@ mod tests {
|
|
|
367
374
|
|
|
368
375
|
let mut result = ExtractionResult {
|
|
369
376
|
content: "test".to_string(),
|
|
370
|
-
mime_type: Cow::Borrowed("text/
|
|
377
|
+
mime_type: Cow::Borrowed("text/html"),
|
|
378
|
+
metadata: Metadata {
|
|
379
|
+
output_format: Some("djot".to_string()),
|
|
380
|
+
..Default::default()
|
|
381
|
+
},
|
|
371
382
|
djot_content: Some(djot_content),
|
|
372
383
|
..Default::default()
|
|
373
384
|
};
|
|
@@ -40,7 +40,13 @@ async fn test_run_pipeline_basic() {
|
|
|
40
40
|
Cow::Borrowed(VALIDATION_MARKER_KEY),
|
|
41
41
|
serde_json::json!(ORDER_VALIDATION_MARKER),
|
|
42
42
|
);
|
|
43
|
-
let config = ExtractionConfig
|
|
43
|
+
let config = ExtractionConfig {
|
|
44
|
+
postprocessor: Some(crate::core::config::PostProcessorConfig {
|
|
45
|
+
enabled: false,
|
|
46
|
+
..Default::default()
|
|
47
|
+
}),
|
|
48
|
+
..Default::default()
|
|
49
|
+
};
|
|
44
50
|
|
|
45
51
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
46
52
|
assert_eq!(processed.content, "test");
|
|
@@ -98,6 +104,10 @@ async fn test_pipeline_without_quality_processing() {
|
|
|
98
104
|
};
|
|
99
105
|
let config = ExtractionConfig {
|
|
100
106
|
enable_quality_processing: false,
|
|
107
|
+
postprocessor: Some(crate::core::config::PostProcessorConfig {
|
|
108
|
+
enabled: false,
|
|
109
|
+
..Default::default()
|
|
110
|
+
}),
|
|
101
111
|
..Default::default()
|
|
102
112
|
};
|
|
103
113
|
|
|
@@ -166,6 +176,10 @@ async fn test_pipeline_without_chunking() {
|
|
|
166
176
|
};
|
|
167
177
|
let config = ExtractionConfig {
|
|
168
178
|
chunking: None,
|
|
179
|
+
postprocessor: Some(crate::core::config::PostProcessorConfig {
|
|
180
|
+
enabled: false,
|
|
181
|
+
..Default::default()
|
|
182
|
+
}),
|
|
169
183
|
..Default::default()
|
|
170
184
|
};
|
|
171
185
|
|
|
@@ -201,7 +215,13 @@ async fn test_pipeline_preserves_metadata() {
|
|
|
201
215
|
quality_score: None,
|
|
202
216
|
processing_warnings: Vec::new(),
|
|
203
217
|
};
|
|
204
|
-
let config = ExtractionConfig
|
|
218
|
+
let config = ExtractionConfig {
|
|
219
|
+
postprocessor: Some(crate::core::config::PostProcessorConfig {
|
|
220
|
+
enabled: false,
|
|
221
|
+
..Default::default()
|
|
222
|
+
}),
|
|
223
|
+
..Default::default()
|
|
224
|
+
};
|
|
205
225
|
|
|
206
226
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
207
227
|
assert_eq!(
|
|
@@ -222,6 +242,7 @@ async fn test_pipeline_preserves_tables() {
|
|
|
222
242
|
cells: vec![vec!["A".to_string(), "B".to_string()]],
|
|
223
243
|
markdown: "| A | B |".to_string(),
|
|
224
244
|
page_number: 0,
|
|
245
|
+
bounding_box: None,
|
|
225
246
|
};
|
|
226
247
|
|
|
227
248
|
let result = ExtractionResult {
|
|
@@ -242,7 +263,13 @@ async fn test_pipeline_preserves_tables() {
|
|
|
242
263
|
quality_score: None,
|
|
243
264
|
processing_warnings: Vec::new(),
|
|
244
265
|
};
|
|
245
|
-
let config = ExtractionConfig
|
|
266
|
+
let config = ExtractionConfig {
|
|
267
|
+
postprocessor: Some(crate::core::config::PostProcessorConfig {
|
|
268
|
+
enabled: false,
|
|
269
|
+
..Default::default()
|
|
270
|
+
}),
|
|
271
|
+
..Default::default()
|
|
272
|
+
};
|
|
246
273
|
|
|
247
274
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
248
275
|
assert_eq!(processed.tables.len(), 1);
|
|
@@ -374,6 +374,7 @@ mod tests {
|
|
|
374
374
|
],
|
|
375
375
|
markdown: "| Header1 | Header2 |\n| Cell1 | Cell2 |".to_string(),
|
|
376
376
|
page_number: 1,
|
|
377
|
+
bounding_box: None,
|
|
377
378
|
};
|
|
378
379
|
|
|
379
380
|
let image = ExtractedImage {
|
|
@@ -388,6 +389,7 @@ mod tests {
|
|
|
388
389
|
is_mask: false,
|
|
389
390
|
description: None,
|
|
390
391
|
ocr_result: None,
|
|
392
|
+
bounding_box: None,
|
|
391
393
|
};
|
|
392
394
|
|
|
393
395
|
let result = ExtractionResult {
|