kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2f7a9518bec5df1adf4b81c08136aff44f7ab6b43cd85d7188625a3224e3a0e9
|
|
4
|
+
data.tar.gz: bd5b5a05b0612d6b66d59d458fd7b0366b0256f290e0158fa86002a64c016429
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dec54496350fe7c4ba61171ebb2c145bc7c2612fde428a395c5aa279bd25fcfedf21ddaeb9012f80db78717315200f2d7f4529119f217fc00e772e066a5de6e0
|
|
7
|
+
data.tar.gz: 68ae3d48391637936e8edd6b16947790bf4bf1da00a1c1efcf23b88da0dec4876d749a358327ea5e5f93e8a8f898a9a34d891bb6ab4311f46fec202de3b2e57a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.
|
|
4
|
+
kreuzberg (4.2.7)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -80,8 +80,9 @@ GEM
|
|
|
80
80
|
ffi (~> 1.0)
|
|
81
81
|
rb_sys (0.9.119)
|
|
82
82
|
rake-compiler-dock (= 1.10.0)
|
|
83
|
-
rbs (3.10.
|
|
83
|
+
rbs (3.10.3)
|
|
84
84
|
logger
|
|
85
|
+
tsort
|
|
85
86
|
regexp_parser (2.11.3)
|
|
86
87
|
reline (0.6.3)
|
|
87
88
|
io-console (~> 0.5)
|
|
@@ -142,6 +143,7 @@ GEM
|
|
|
142
143
|
strscan (3.1.7)
|
|
143
144
|
terminal-table (4.0.0)
|
|
144
145
|
unicode-display_width (>= 1.1.1, < 4)
|
|
146
|
+
tsort (0.2.0)
|
|
145
147
|
tzinfo (2.0.6)
|
|
146
148
|
concurrent-ruby (~> 1.0)
|
|
147
149
|
unicode-display_width (3.2.0)
|
|
@@ -207,7 +209,7 @@ CHECKSUMS
|
|
|
207
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
211
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.2.
|
|
212
|
+
kreuzberg (4.2.7)
|
|
211
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -228,7 +230,7 @@ CHECKSUMS
|
|
|
228
230
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
229
231
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
230
232
|
rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
|
|
231
|
-
rbs (3.10.
|
|
233
|
+
rbs (3.10.3) sha256=70627f3919016134d554e6c99195552ae3ef6020fe034c8e983facc9c192daa6
|
|
232
234
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
233
235
|
reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
|
|
234
236
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
@@ -246,6 +248,7 @@ CHECKSUMS
|
|
|
246
248
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
247
249
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
248
250
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
|
251
|
+
tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
|
|
249
252
|
tzinfo (2.0.6) sha256=8daf828cc77bcf7d63b0e3bdb6caa47e2272dcfaf4fbfe46f8c3a9df087a829b
|
|
250
253
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
251
254
|
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.7" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -716,9 +716,9 @@ dependencies = [
|
|
|
716
716
|
|
|
717
717
|
[[package]]
|
|
718
718
|
name = "cc"
|
|
719
|
-
version = "1.2.
|
|
719
|
+
version = "1.2.55"
|
|
720
720
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
721
|
-
checksum = "
|
|
721
|
+
checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29"
|
|
722
722
|
dependencies = [
|
|
723
723
|
"find-msvc-tools",
|
|
724
724
|
"jobserver",
|
|
@@ -1631,9 +1631,9 @@ dependencies = [
|
|
|
1631
1631
|
|
|
1632
1632
|
[[package]]
|
|
1633
1633
|
name = "find-msvc-tools"
|
|
1634
|
-
version = "0.1.
|
|
1634
|
+
version = "0.1.9"
|
|
1635
1635
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1636
|
-
checksum = "
|
|
1636
|
+
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
|
|
1637
1637
|
|
|
1638
1638
|
[[package]]
|
|
1639
1639
|
name = "flate2"
|
|
@@ -2027,9 +2027,9 @@ dependencies = [
|
|
|
2027
2027
|
|
|
2028
2028
|
[[package]]
|
|
2029
2029
|
name = "html-to-markdown-rs"
|
|
2030
|
-
version = "2.
|
|
2030
|
+
version = "2.24.3"
|
|
2031
2031
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2032
|
-
checksum = "
|
|
2032
|
+
checksum = "51e190e3bcf14728f08547b10ba3afdae24c84299045e0831bb9ee1e199d54ad"
|
|
2033
2033
|
dependencies = [
|
|
2034
2034
|
"astral-tl",
|
|
2035
2035
|
"base64 0.22.1",
|
|
@@ -2627,7 +2627,7 @@ dependencies = [
|
|
|
2627
2627
|
|
|
2628
2628
|
[[package]]
|
|
2629
2629
|
name = "kreuzberg"
|
|
2630
|
-
version = "4.2.
|
|
2630
|
+
version = "4.2.6"
|
|
2631
2631
|
dependencies = [
|
|
2632
2632
|
"ahash",
|
|
2633
2633
|
"async-trait",
|
|
@@ -2636,6 +2636,7 @@ dependencies = [
|
|
|
2636
2636
|
"base64-simd",
|
|
2637
2637
|
"biblatex",
|
|
2638
2638
|
"bitvec",
|
|
2639
|
+
"bytes",
|
|
2639
2640
|
"calamine",
|
|
2640
2641
|
"chardetng",
|
|
2641
2642
|
"dashmap",
|
|
@@ -2703,6 +2704,7 @@ dependencies = [
|
|
|
2703
2704
|
"typst-syntax",
|
|
2704
2705
|
"unicode-normalization",
|
|
2705
2706
|
"ureq 3.1.4",
|
|
2707
|
+
"utoipa",
|
|
2706
2708
|
"uuid",
|
|
2707
2709
|
"whatlang",
|
|
2708
2710
|
"yake-rust",
|
|
@@ -2711,8 +2713,9 @@ dependencies = [
|
|
|
2711
2713
|
|
|
2712
2714
|
[[package]]
|
|
2713
2715
|
name = "kreuzberg-ffi"
|
|
2714
|
-
version = "4.2.
|
|
2716
|
+
version = "4.2.6"
|
|
2715
2717
|
dependencies = [
|
|
2718
|
+
"ahash",
|
|
2716
2719
|
"async-trait",
|
|
2717
2720
|
"cbindgen",
|
|
2718
2721
|
"html-to-markdown-rs",
|
|
@@ -2768,7 +2771,7 @@ dependencies = [
|
|
|
2768
2771
|
|
|
2769
2772
|
[[package]]
|
|
2770
2773
|
name = "kreuzberg-tesseract"
|
|
2771
|
-
version = "4.2.
|
|
2774
|
+
version = "4.2.6"
|
|
2772
2775
|
dependencies = [
|
|
2773
2776
|
"cc",
|
|
2774
2777
|
"cmake",
|
|
@@ -6704,6 +6707,30 @@ version = "0.2.2"
|
|
|
6704
6707
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6705
6708
|
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
|
6706
6709
|
|
|
6710
|
+
[[package]]
|
|
6711
|
+
name = "utoipa"
|
|
6712
|
+
version = "5.4.0"
|
|
6713
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6714
|
+
checksum = "2fcc29c80c21c31608227e0912b2d7fddba57ad76b606890627ba8ee7964e993"
|
|
6715
|
+
dependencies = [
|
|
6716
|
+
"indexmap",
|
|
6717
|
+
"serde",
|
|
6718
|
+
"serde_json",
|
|
6719
|
+
"utoipa-gen",
|
|
6720
|
+
]
|
|
6721
|
+
|
|
6722
|
+
[[package]]
|
|
6723
|
+
name = "utoipa-gen"
|
|
6724
|
+
version = "5.4.0"
|
|
6725
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6726
|
+
checksum = "6d79d08d92ab8af4c5e8a6da20c47ae3f61a0f1dabc1997cdf2d082b757ca08b"
|
|
6727
|
+
dependencies = [
|
|
6728
|
+
"proc-macro2",
|
|
6729
|
+
"quote",
|
|
6730
|
+
"regex",
|
|
6731
|
+
"syn",
|
|
6732
|
+
]
|
|
6733
|
+
|
|
6707
6734
|
[[package]]
|
|
6708
6735
|
name = "uuid"
|
|
6709
6736
|
version = "1.20.0"
|
|
@@ -1,5 +1,37 @@
|
|
|
1
1
|
[workspace]
|
|
2
2
|
|
|
3
|
+
[workspace.dependencies]
|
|
4
|
+
bytes = { version = "1", features = ["serde"] }
|
|
5
|
+
serde = { version = "1.0.228", features = ["derive"] }
|
|
6
|
+
serde_json = { version = "1.0.149" }
|
|
7
|
+
tokio = { version = "1.49.0", features = [
|
|
8
|
+
"rt",
|
|
9
|
+
"rt-multi-thread",
|
|
10
|
+
"macros",
|
|
11
|
+
"sync",
|
|
12
|
+
"process",
|
|
13
|
+
"fs",
|
|
14
|
+
"time",
|
|
15
|
+
"io-util",
|
|
16
|
+
] }
|
|
17
|
+
thiserror = "2.0.18"
|
|
18
|
+
anyhow = "1.0"
|
|
19
|
+
libc = "0.2.180"
|
|
20
|
+
async-trait = "0.1.89"
|
|
21
|
+
tracing = "0.1"
|
|
22
|
+
ahash = "0.8.12"
|
|
23
|
+
base64 = "0.22.1"
|
|
24
|
+
hex = "0.4.3"
|
|
25
|
+
num_cpus = "1.17.0"
|
|
26
|
+
once_cell = "1.21.3"
|
|
27
|
+
parking_lot = "0.12.5"
|
|
28
|
+
html-to-markdown-rs = { version = "2.24.3", default-features = false }
|
|
29
|
+
reqwest = { version = "0.13.1", default-features = false }
|
|
30
|
+
image = { version = "0.25.9", default-features = false }
|
|
31
|
+
toml = "0.9.11"
|
|
32
|
+
tempfile = "3.24.0"
|
|
33
|
+
lzma-rust2 = { version = "0.15.7" }
|
|
34
|
+
|
|
3
35
|
[workspace.lints.clippy]
|
|
4
36
|
collapsible_if = "allow"
|
|
5
37
|
|
|
@@ -93,8 +93,10 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
|
|
|
93
93
|
};
|
|
94
94
|
|
|
95
95
|
let config = ChunkingConfig {
|
|
96
|
-
max_chars,
|
|
97
|
-
max_overlap,
|
|
96
|
+
max_characters: max_chars,
|
|
97
|
+
overlap: max_overlap,
|
|
98
|
+
trim: true,
|
|
99
|
+
chunker_type: kreuzberg::ChunkerType::Text,
|
|
98
100
|
embedding,
|
|
99
101
|
preset,
|
|
100
102
|
};
|
|
@@ -95,7 +95,7 @@ pub fn register_post_processor(args: &[Value]) -> Result<(), Error> {
|
|
|
95
95
|
message: format!("Failed to convert mime_type: {}", e),
|
|
96
96
|
plugin_name: processor_name.clone(),
|
|
97
97
|
})?;
|
|
98
|
-
updated_result.mime_type = new_mime;
|
|
98
|
+
updated_result.mime_type = std::borrow::Cow::Owned(new_mime);
|
|
99
99
|
}
|
|
100
100
|
|
|
101
101
|
Ok::<kreuzberg::ExtractionResult, kreuzberg::KreuzbergError>(updated_result)
|
|
@@ -27,7 +27,7 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
27
27
|
let content_value = ruby.str_new(result.content.as_str()).into_value_with(ruby);
|
|
28
28
|
set_hash_entry(ruby, &hash, "content", content_value)?;
|
|
29
29
|
|
|
30
|
-
let mime_value = ruby.str_new(result.mime_type.
|
|
30
|
+
let mime_value = ruby.str_new(result.mime_type.as_ref()).into_value_with(ruby);
|
|
31
31
|
set_hash_entry(ruby, &hash, "mime_type", mime_value)?;
|
|
32
32
|
|
|
33
33
|
// Set metadata both as JSON string and parsed hash
|
|
@@ -117,7 +117,8 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
117
117
|
let image_hash = ruby.hash_new();
|
|
118
118
|
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
119
119
|
image_hash.aset("data", data_value)?;
|
|
120
|
-
|
|
120
|
+
let format_value = ruby.str_new(image.format.as_ref()).into_value_with(ruby);
|
|
121
|
+
image_hash.aset("format", format_value)?;
|
|
121
122
|
image_hash.aset("image_index", image.image_index as i64)?;
|
|
122
123
|
if let Some(page) = image.page_number {
|
|
123
124
|
image_hash.aset("page_number", page as i64)?;
|
|
@@ -200,7 +201,8 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
200
201
|
let image_hash = ruby.hash_new();
|
|
201
202
|
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
202
203
|
image_hash.aset("data", data_value)?;
|
|
203
|
-
|
|
204
|
+
let format_value = ruby.str_new(image.format.as_ref()).into_value_with(ruby);
|
|
205
|
+
image_hash.aset("format", format_value)?;
|
|
204
206
|
image_hash.aset("image_index", image.image_index as i64)?;
|
|
205
207
|
if let Some(page) = image.page_number {
|
|
206
208
|
image_hash.aset("page_number", page as i64)?;
|
data/lib/kreuzberg/version.rb
CHANGED