kreuzberg 4.4.2 → 4.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +13 -13
- data/ext/kreuzberg_rb/native/Cargo.toml +5 -5
- data/ext/kreuzberg_rb/native/src/config/types.rs +7 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +4 -3
- data/vendor/kreuzberg/Cargo.toml +6 -6
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/chunking/builder.rs +42 -141
- data/vendor/kreuzberg/src/chunking/core.rs +136 -1
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +7 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +52 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +3 -1
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +1 -1
- data/vendor/kreuzberg/src/extraction/html/converter.rs +34 -33
- data/vendor/kreuzberg/src/extraction/html/mod.rs +5 -1
- data/vendor/kreuzberg/src/extraction/html/processor.rs +2 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/html.rs +99 -192
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +8 -3
- data/vendor/kreuzberg/src/mcp/server.rs +6 -7
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -11
- data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +121 -13
- data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +2 -2
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/build.rs +140 -27
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 879027155ef841eb494ce23bdd9817664d93cc86acb59cf7d04408b6b717cff5
|
|
4
|
+
data.tar.gz: be26ecbf4ad1f07cfa7f02a197ef355ed534e4a708361b8a2fcf4feb6bce514a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 980236a2633bfdc3b835aa8bf4b215aa623be2d61d74758bc6b668d5bcc48fecd7ed9dd97db5cb3a65dce3b2aae9aee8b15f2adb5df89bd48f7179fe0daab506
|
|
7
|
+
data.tar.gz: 57a7ed2f02591dbe8c7a79078504e2130953bb177151d246b8dffab40c9db1b260b526ac4abd0780b88835ea3378f835583a9400af7ae32481ad9c90ac4e86eb
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.
|
|
4
|
+
kreuzberg (4.4.3)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -110,7 +110,7 @@ GEM
|
|
|
110
110
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
111
111
|
rspec-support (~> 3.13.0)
|
|
112
112
|
rspec-support (3.13.7)
|
|
113
|
-
rubocop (1.85.
|
|
113
|
+
rubocop (1.85.1)
|
|
114
114
|
json (~> 2.3)
|
|
115
115
|
language_server-protocol (~> 3.17.0.2)
|
|
116
116
|
lint_roller (~> 1.1.0)
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.12993)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -222,7 +222,7 @@ CHECKSUMS
|
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
223
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
224
224
|
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
|
225
|
-
kreuzberg (4.4.
|
|
225
|
+
kreuzberg (4.4.3)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -253,13 +253,13 @@ CHECKSUMS
|
|
|
253
253
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
254
254
|
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
255
255
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
256
|
-
rubocop (1.85.
|
|
256
|
+
rubocop (1.85.1) sha256=3dbcf9e961baa4c376eeeb2a03913dca5e3987033b04d38fa538aa1e7406cc77
|
|
257
257
|
rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
|
|
258
258
|
rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.12993) sha256=5720d6e70063ed39528ddb18248c13a8072cf6991cf7d6652dcc0b8e9bc6b4ac
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1390,9 +1390,9 @@ dependencies = [
|
|
|
1390
1390
|
|
|
1391
1391
|
[[package]]
|
|
1392
1392
|
name = "fastembed"
|
|
1393
|
-
version = "5.
|
|
1393
|
+
version = "5.12.0"
|
|
1394
1394
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1395
|
-
checksum = "
|
|
1395
|
+
checksum = "e7b01c79c5cb8ab3ce31c3d52916fda278e14cac027ff3a9cb66c419ed7288f8"
|
|
1396
1396
|
dependencies = [
|
|
1397
1397
|
"anyhow",
|
|
1398
1398
|
"hf-hub 0.4.3",
|
|
@@ -1977,9 +1977,9 @@ dependencies = [
|
|
|
1977
1977
|
|
|
1978
1978
|
[[package]]
|
|
1979
1979
|
name = "html-to-markdown-rs"
|
|
1980
|
-
version = "2.
|
|
1980
|
+
version = "2.28.0"
|
|
1981
1981
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1982
|
-
checksum = "
|
|
1982
|
+
checksum = "842656c6f85a8e7b35fd5656646ffa0d6644cdcfe429634c7d9923a5626e330b"
|
|
1983
1983
|
dependencies = [
|
|
1984
1984
|
"ahash",
|
|
1985
1985
|
"astral-tl",
|
|
@@ -2661,7 +2661,7 @@ dependencies = [
|
|
|
2661
2661
|
"thiserror 2.0.18",
|
|
2662
2662
|
"tiff 0.11.3",
|
|
2663
2663
|
"tokio",
|
|
2664
|
-
"toml 1.0.
|
|
2664
|
+
"toml 1.0.4+spec-1.1.0",
|
|
2665
2665
|
"tower",
|
|
2666
2666
|
"tower-http",
|
|
2667
2667
|
"tracing",
|
|
@@ -2729,7 +2729,7 @@ dependencies = [
|
|
|
2729
2729
|
|
|
2730
2730
|
[[package]]
|
|
2731
2731
|
name = "kreuzberg-rb"
|
|
2732
|
-
version = "4.4.
|
|
2732
|
+
version = "4.4.3"
|
|
2733
2733
|
dependencies = [
|
|
2734
2734
|
"async-trait",
|
|
2735
2735
|
"html-to-markdown-rs",
|
|
@@ -2741,7 +2741,7 @@ dependencies = [
|
|
|
2741
2741
|
"serde_json",
|
|
2742
2742
|
"serde_yaml_ng",
|
|
2743
2743
|
"tokio",
|
|
2744
|
-
"toml 1.0.
|
|
2744
|
+
"toml 1.0.4+spec-1.1.0",
|
|
2745
2745
|
]
|
|
2746
2746
|
|
|
2747
2747
|
[[package]]
|
|
@@ -5294,9 +5294,9 @@ dependencies = [
|
|
|
5294
5294
|
|
|
5295
5295
|
[[package]]
|
|
5296
5296
|
name = "toml"
|
|
5297
|
-
version = "1.0.
|
|
5297
|
+
version = "1.0.4+spec-1.1.0"
|
|
5298
5298
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5299
|
-
checksum = "
|
|
5299
|
+
checksum = "c94c3321114413476740df133f0d8862c61d87c8d26f04c6841e033c8c80db47"
|
|
5300
5300
|
dependencies = [
|
|
5301
5301
|
"indexmap",
|
|
5302
5302
|
"serde_core",
|
|
@@ -5700,9 +5700,9 @@ dependencies = [
|
|
|
5700
5700
|
|
|
5701
5701
|
[[package]]
|
|
5702
5702
|
name = "uuid"
|
|
5703
|
-
version = "1.
|
|
5703
|
+
version = "1.22.0"
|
|
5704
5704
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5705
|
-
checksum = "
|
|
5705
|
+
checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37"
|
|
5706
5706
|
dependencies = [
|
|
5707
5707
|
"getrandom 0.4.2",
|
|
5708
5708
|
"js-sys",
|
|
@@ -6314,9 +6314,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
|
|
|
6314
6314
|
|
|
6315
6315
|
[[package]]
|
|
6316
6316
|
name = "winnow"
|
|
6317
|
-
version = "0.7.
|
|
6317
|
+
version = "0.7.15"
|
|
6318
6318
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6319
|
-
checksum = "
|
|
6319
|
+
checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
|
|
6320
6320
|
|
|
6321
6321
|
[[package]]
|
|
6322
6322
|
name = "wit-bindgen"
|
|
@@ -25,10 +25,10 @@ hex = "0.4.3"
|
|
|
25
25
|
num_cpus = "1.17.0"
|
|
26
26
|
once_cell = "1.21.3"
|
|
27
27
|
parking_lot = "0.12.5"
|
|
28
|
-
html-to-markdown-rs = { version = "2.
|
|
28
|
+
html-to-markdown-rs = { version = "2.28.0", default-features = false }
|
|
29
29
|
reqwest = { version = "0.13.2", default-features = false }
|
|
30
30
|
image = { version = "0.25.9", default-features = false }
|
|
31
|
-
toml = "1.0.
|
|
31
|
+
toml = "1.0.4"
|
|
32
32
|
tempfile = "3.26.0"
|
|
33
33
|
lzma-rust2 = { version = "0.16.2" }
|
|
34
34
|
log = "0.4"
|
|
@@ -45,7 +45,7 @@ collapsible_if = "allow"
|
|
|
45
45
|
|
|
46
46
|
[package]
|
|
47
47
|
name = "kreuzberg-rb"
|
|
48
|
-
version = "4.4.
|
|
48
|
+
version = "4.4.3"
|
|
49
49
|
edition = "2024"
|
|
50
50
|
rust-version = "1.91"
|
|
51
51
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -99,7 +99,7 @@ rb-sys = { version = "0.9.124", default-features = false, features = [
|
|
|
99
99
|
"stable-api-compiled-fallback",
|
|
100
100
|
] }
|
|
101
101
|
serde_json = "1.0.149"
|
|
102
|
-
toml = "1.0.
|
|
102
|
+
toml = "1.0.4"
|
|
103
103
|
serde_yaml_ng = "0.10"
|
|
104
104
|
tokio = { version = "1.50.0", features = [
|
|
105
105
|
"rt",
|
|
@@ -111,7 +111,7 @@ tokio = { version = "1.50.0", features = [
|
|
|
111
111
|
"time",
|
|
112
112
|
"io-util",
|
|
113
113
|
] }
|
|
114
|
-
html-to-markdown-rs = { version = "2.
|
|
114
|
+
html-to-markdown-rs = { version = "2.28.0", default-features = false }
|
|
115
115
|
|
|
116
116
|
[dev-dependencies]
|
|
117
117
|
pretty_assertions = "1.4"
|
|
@@ -317,10 +317,17 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
|
|
|
317
317
|
600
|
|
318
318
|
};
|
|
319
319
|
|
|
320
|
+
let inject_placeholders = if let Some(val) = get_kw(ruby, hash, "inject_placeholders") {
|
|
321
|
+
bool::try_convert(val)?
|
|
322
|
+
} else {
|
|
323
|
+
true
|
|
324
|
+
};
|
|
325
|
+
|
|
320
326
|
let config = ImageExtractionConfig {
|
|
321
327
|
extract_images,
|
|
322
328
|
target_dpi,
|
|
323
329
|
max_image_dimension,
|
|
330
|
+
inject_placeholders,
|
|
324
331
|
auto_adjust_dpi,
|
|
325
332
|
min_dpi,
|
|
326
333
|
max_dpi,
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.4.
|
|
5
|
+
version = "4.4.3"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -23,7 +23,7 @@ criterion = { version = "0.8", features = ["html_reports"] }
|
|
|
23
23
|
ctor = "0.6"
|
|
24
24
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
25
25
|
hex = "0.4.3"
|
|
26
|
-
html-to-markdown-rs = { version = "2.
|
|
26
|
+
html-to-markdown-rs = { version = "2.28.0", default-features = false }
|
|
27
27
|
image = { version = "0.25.9", default-features = false }
|
|
28
28
|
itertools = "0.14"
|
|
29
29
|
js-sys = "0.3"
|
|
@@ -33,6 +33,7 @@ log = "0.4"
|
|
|
33
33
|
lzma-rust2 = { version = "0.16.2" }
|
|
34
34
|
num_cpus = "1.17.0"
|
|
35
35
|
once_cell = "1.21.3"
|
|
36
|
+
ort = { version = "=2.0.0-rc.11", default-features = false }
|
|
36
37
|
parking_lot = "0.12.5"
|
|
37
38
|
pdfium-render = { package = "kreuzberg-pdfium-render", version = "4.3" }
|
|
38
39
|
rayon = "1.11.0"
|
|
@@ -42,7 +43,7 @@ serde_json = { version = "1.0.149" }
|
|
|
42
43
|
tempfile = "3.26.0"
|
|
43
44
|
thiserror = "2.0.18"
|
|
44
45
|
tokio = { version = "1.50.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
45
|
-
toml = "1.0.
|
|
46
|
+
toml = "1.0.4"
|
|
46
47
|
tracing = "0.1"
|
|
47
48
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
48
49
|
wasm-bindgen-futures = "0.4"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.4.
|
|
3
|
+
version = "4.4.3"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -153,7 +153,7 @@ serde = { version = "1.0.228", features = ["derive"] }
|
|
|
153
153
|
serde_json = { version = "1.0.149" }
|
|
154
154
|
serde_yaml_ng = "0.10.0"
|
|
155
155
|
jotdown = "0.9"
|
|
156
|
-
toml = "1.0.
|
|
156
|
+
toml = "1.0.4"
|
|
157
157
|
mime_guess = "2.0"
|
|
158
158
|
rmp-serde = "1.3"
|
|
159
159
|
thiserror = "2.0.18"
|
|
@@ -169,8 +169,8 @@ zip = { version = "8.2.0", optional = true, default-features = false, features =
|
|
|
169
169
|
"deflate-flate2",
|
|
170
170
|
] }
|
|
171
171
|
mail-parser = { version = "0.11.2", optional = true }
|
|
172
|
-
html-to-markdown-rs = { version = "2.
|
|
173
|
-
"inline-images", "metadata", ], optional = true }
|
|
172
|
+
html-to-markdown-rs = { version = "2.28.0", default-features = false , features = [
|
|
173
|
+
"inline-images", "metadata", "visitor", ], optional = true }
|
|
174
174
|
cfb = { version = "0.14.0", optional = true }
|
|
175
175
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
176
176
|
tar = { version = "0.4.44", optional = true }
|
|
@@ -244,7 +244,7 @@ kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "4.3", opti
|
|
|
244
244
|
reqwest = { version = "0.13.2", default-features = false , features = [
|
|
245
245
|
"json", "rustls", "blocking", ], optional = true }
|
|
246
246
|
# Use rustls-tls for fastembed on non-Windows platforms
|
|
247
|
-
fastembed = { version = "5.
|
|
247
|
+
fastembed = { version = "5.12", default-features = false, features = [
|
|
248
248
|
"hf-hub-rustls-tls",
|
|
249
249
|
"ort-load-dynamic",
|
|
250
250
|
], optional = true }
|
|
@@ -260,7 +260,7 @@ kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "4.3", opti
|
|
|
260
260
|
reqwest = { version = "0.13.2", default-features = false , features = [
|
|
261
261
|
"json", "native-tls", "blocking", ], optional = true }
|
|
262
262
|
# Use native-tls for fastembed on Windows
|
|
263
|
-
fastembed = { version = "5.
|
|
263
|
+
fastembed = { version = "5.12", default-features = false, features = [
|
|
264
264
|
"hf-hub-native-tls",
|
|
265
265
|
"ort-load-dynamic",
|
|
266
266
|
], optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.4.
|
|
20
|
+
> **🚀 Version 4.4.3 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -35,14 +35,15 @@ pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) ->
|
|
|
35
35
|
///
|
|
36
36
|
/// This function takes a collection of text segments (produced by a text splitter)
|
|
37
37
|
/// and constructs Chunk objects with proper metadata, including:
|
|
38
|
-
/// - Byte offsets
|
|
38
|
+
/// - Byte offsets derived from the chunk's position in the source text
|
|
39
39
|
/// - Chunk indices and total count
|
|
40
40
|
/// - Page boundary information (if provided)
|
|
41
41
|
///
|
|
42
42
|
/// # Arguments
|
|
43
43
|
///
|
|
44
|
+
/// * `source_text` - The original text that the chunks were split from. Chunk
|
|
45
|
+
/// slices must borrow from this text (as `text-splitter` guarantees).
|
|
44
46
|
/// * `text_chunks` - Iterator of text segments to convert into chunks
|
|
45
|
-
/// * `overlap` - Number of characters to overlap between chunks
|
|
46
47
|
/// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
|
|
47
48
|
///
|
|
48
49
|
/// # Returns
|
|
@@ -53,8 +54,8 @@ pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) ->
|
|
|
53
54
|
///
|
|
54
55
|
/// Returns an error if page boundary calculation fails.
|
|
55
56
|
pub fn build_chunks<'a, I>(
|
|
57
|
+
source_text: &'a str,
|
|
56
58
|
text_chunks: I,
|
|
57
|
-
overlap: usize,
|
|
58
59
|
page_boundaries: Option<&[PageBoundary]>,
|
|
59
60
|
) -> Result<Vec<Chunk>>
|
|
60
61
|
where
|
|
@@ -62,86 +63,37 @@ where
|
|
|
62
63
|
{
|
|
63
64
|
let chunks_vec: Vec<&str> = text_chunks.into_iter().collect();
|
|
64
65
|
let total_chunks = chunks_vec.len();
|
|
65
|
-
let
|
|
66
|
+
let source_start = source_text.as_ptr() as usize;
|
|
66
67
|
let mut chunks = Vec::with_capacity(total_chunks);
|
|
67
68
|
|
|
68
69
|
for (index, chunk_text) in chunks_vec.into_iter().enumerate() {
|
|
69
|
-
let
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
70
|
+
let byte_start = chunk_text.as_ptr() as usize - source_start;
|
|
71
|
+
let byte_end = byte_start + chunk_text.len();
|
|
72
|
+
|
|
73
|
+
let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
|
|
74
|
+
calculate_page_range(byte_start, byte_end, boundaries)?
|
|
75
|
+
} else {
|
|
76
|
+
(None, None)
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
chunks.push(Chunk {
|
|
80
|
+
content: chunk_text.to_string(),
|
|
81
|
+
embedding: None,
|
|
82
|
+
metadata: ChunkMetadata {
|
|
83
|
+
byte_start,
|
|
84
|
+
byte_end,
|
|
85
|
+
token_count: None,
|
|
86
|
+
chunk_index: index,
|
|
87
|
+
total_chunks,
|
|
88
|
+
first_page,
|
|
89
|
+
last_page,
|
|
90
|
+
},
|
|
91
|
+
});
|
|
78
92
|
}
|
|
79
93
|
|
|
80
94
|
Ok(chunks)
|
|
81
95
|
}
|
|
82
96
|
|
|
83
|
-
/// Build a single chunk with metadata.
|
|
84
|
-
///
|
|
85
|
-
/// # Arguments
|
|
86
|
-
///
|
|
87
|
-
/// * `chunk_text` - The text content for this chunk
|
|
88
|
-
/// * `index` - Zero-based index of this chunk
|
|
89
|
-
/// * `total_chunks` - Total number of chunks in the collection
|
|
90
|
-
/// * `byte_offset` - Mutable reference to current byte offset (will be updated)
|
|
91
|
-
/// * `overlap` - Number of characters to overlap between chunks
|
|
92
|
-
/// * `page_boundaries` - Optional page boundary markers
|
|
93
|
-
///
|
|
94
|
-
/// # Returns
|
|
95
|
-
///
|
|
96
|
-
/// A complete Chunk object with all metadata filled in.
|
|
97
|
-
///
|
|
98
|
-
/// # Errors
|
|
99
|
-
///
|
|
100
|
-
/// Returns an error if page boundary calculation fails.
|
|
101
|
-
fn build_single_chunk(
|
|
102
|
-
chunk_text: &str,
|
|
103
|
-
index: usize,
|
|
104
|
-
total_chunks: usize,
|
|
105
|
-
byte_offset: &mut usize,
|
|
106
|
-
overlap: usize,
|
|
107
|
-
page_boundaries: Option<&[PageBoundary]>,
|
|
108
|
-
) -> Result<Chunk> {
|
|
109
|
-
let byte_start = *byte_offset;
|
|
110
|
-
let chunk_length = chunk_text.len();
|
|
111
|
-
let byte_end = byte_start + chunk_length;
|
|
112
|
-
|
|
113
|
-
// Calculate overlap for next chunk (not applicable to last chunk)
|
|
114
|
-
let overlap_chars = if index < total_chunks - 1 {
|
|
115
|
-
overlap.min(chunk_length)
|
|
116
|
-
} else {
|
|
117
|
-
0
|
|
118
|
-
};
|
|
119
|
-
|
|
120
|
-
// Update offset for next chunk, accounting for overlap
|
|
121
|
-
*byte_offset = byte_end - overlap_chars;
|
|
122
|
-
|
|
123
|
-
// Calculate page range if boundaries are provided
|
|
124
|
-
let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
|
|
125
|
-
calculate_page_range(byte_start, byte_end, boundaries)?
|
|
126
|
-
} else {
|
|
127
|
-
(None, None)
|
|
128
|
-
};
|
|
129
|
-
|
|
130
|
-
Ok(Chunk {
|
|
131
|
-
content: chunk_text.to_string(),
|
|
132
|
-
embedding: None,
|
|
133
|
-
metadata: ChunkMetadata {
|
|
134
|
-
byte_start,
|
|
135
|
-
byte_end,
|
|
136
|
-
token_count: None,
|
|
137
|
-
chunk_index: index,
|
|
138
|
-
total_chunks,
|
|
139
|
-
first_page,
|
|
140
|
-
last_page,
|
|
141
|
-
},
|
|
142
|
-
})
|
|
143
|
-
}
|
|
144
|
-
|
|
145
97
|
#[cfg(test)]
|
|
146
98
|
mod tests {
|
|
147
99
|
use super::*;
|
|
@@ -162,15 +114,16 @@ mod tests {
|
|
|
162
114
|
|
|
163
115
|
#[test]
|
|
164
116
|
fn test_build_chunks_empty() {
|
|
117
|
+
let source = "";
|
|
165
118
|
let text_chunks: Vec<&str> = vec![];
|
|
166
|
-
let result = build_chunks(
|
|
119
|
+
let result = build_chunks(source, text_chunks, None).unwrap();
|
|
167
120
|
assert_eq!(result.len(), 0);
|
|
168
121
|
}
|
|
169
122
|
|
|
170
123
|
#[test]
|
|
171
124
|
fn test_build_chunks_single() {
|
|
172
|
-
let
|
|
173
|
-
let result = build_chunks(
|
|
125
|
+
let source = "Single chunk";
|
|
126
|
+
let result = build_chunks(source, vec![source], None).unwrap();
|
|
174
127
|
assert_eq!(result.len(), 1);
|
|
175
128
|
assert_eq!(result[0].content, "Single chunk");
|
|
176
129
|
assert_eq!(result[0].metadata.chunk_index, 0);
|
|
@@ -179,29 +132,10 @@ mod tests {
|
|
|
179
132
|
assert_eq!(result[0].metadata.byte_end, 12);
|
|
180
133
|
}
|
|
181
134
|
|
|
182
|
-
#[test]
|
|
183
|
-
fn test_build_chunks_multiple_with_overlap() {
|
|
184
|
-
let text_chunks = vec!["First chunk here", "Second chunk here", "Third chunk here"];
|
|
185
|
-
let overlap = 5;
|
|
186
|
-
let result = build_chunks(text_chunks, overlap, None).unwrap();
|
|
187
|
-
|
|
188
|
-
assert_eq!(result.len(), 3);
|
|
189
|
-
|
|
190
|
-
// First chunk
|
|
191
|
-
assert_eq!(result[0].content, "First chunk here");
|
|
192
|
-
assert_eq!(result[0].metadata.byte_start, 0);
|
|
193
|
-
assert_eq!(result[0].metadata.byte_end, 16);
|
|
194
|
-
|
|
195
|
-
// Second chunk should start before first ends (overlap)
|
|
196
|
-
assert!(result[1].metadata.byte_start < result[0].metadata.byte_end);
|
|
197
|
-
|
|
198
|
-
// Third chunk should start before second ends (overlap)
|
|
199
|
-
assert!(result[2].metadata.byte_start < result[1].metadata.byte_end);
|
|
200
|
-
}
|
|
201
|
-
|
|
202
135
|
#[test]
|
|
203
136
|
fn test_build_chunks_with_page_boundaries() {
|
|
204
|
-
let
|
|
137
|
+
let source = "First chunkSecond chunk";
|
|
138
|
+
let text_chunks = vec![&source[0..11], &source[11..23]];
|
|
205
139
|
let boundaries = vec![
|
|
206
140
|
PageBoundary {
|
|
207
141
|
byte_start: 0,
|
|
@@ -215,7 +149,7 @@ mod tests {
|
|
|
215
149
|
},
|
|
216
150
|
];
|
|
217
151
|
|
|
218
|
-
let result = build_chunks(
|
|
152
|
+
let result = build_chunks(source, text_chunks, Some(&boundaries)).unwrap();
|
|
219
153
|
|
|
220
154
|
assert_eq!(result.len(), 2);
|
|
221
155
|
assert_eq!(result[0].metadata.first_page, Some(1));
|
|
@@ -223,65 +157,32 @@ mod tests {
|
|
|
223
157
|
}
|
|
224
158
|
|
|
225
159
|
#[test]
|
|
226
|
-
fn
|
|
227
|
-
let
|
|
228
|
-
|
|
229
|
-
let
|
|
160
|
+
fn test_build_chunks_offset_from_source() {
|
|
161
|
+
let source = "AAAAABBBBBCCCCC";
|
|
162
|
+
// Overlapping slices from source
|
|
163
|
+
let text_chunks = vec![&source[0..5], &source[3..8], &source[6..11]];
|
|
164
|
+
let result = build_chunks(source, text_chunks, None).unwrap();
|
|
230
165
|
|
|
231
166
|
assert_eq!(result.len(), 3);
|
|
232
167
|
|
|
233
|
-
// First chunk: 0-5
|
|
234
168
|
assert_eq!(result[0].metadata.byte_start, 0);
|
|
235
169
|
assert_eq!(result[0].metadata.byte_end, 5);
|
|
236
170
|
|
|
237
|
-
// Second chunk: 3-8 (overlap of 2)
|
|
238
171
|
assert_eq!(result[1].metadata.byte_start, 3);
|
|
239
172
|
assert_eq!(result[1].metadata.byte_end, 8);
|
|
240
173
|
|
|
241
|
-
// Third chunk: 6-11 (overlap of 2, but last chunk so no further adjustment)
|
|
242
174
|
assert_eq!(result[2].metadata.byte_start, 6);
|
|
243
175
|
assert_eq!(result[2].metadata.byte_end, 11);
|
|
244
176
|
}
|
|
245
177
|
|
|
246
|
-
#[test]
|
|
247
|
-
fn test_build_single_chunk_metadata() {
|
|
248
|
-
let mut offset = 0;
|
|
249
|
-
let chunk = build_single_chunk("Test content", 0, 1, &mut offset, 5, None).unwrap();
|
|
250
|
-
|
|
251
|
-
assert_eq!(chunk.content, "Test content");
|
|
252
|
-
assert_eq!(chunk.metadata.byte_start, 0);
|
|
253
|
-
assert_eq!(chunk.metadata.byte_end, 12);
|
|
254
|
-
assert_eq!(chunk.metadata.chunk_index, 0);
|
|
255
|
-
assert_eq!(chunk.metadata.total_chunks, 1);
|
|
256
|
-
assert_eq!(chunk.metadata.first_page, None);
|
|
257
|
-
assert_eq!(chunk.metadata.last_page, None);
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
#[test]
|
|
261
|
-
fn test_build_single_chunk_with_overlap() {
|
|
262
|
-
let mut offset = 0;
|
|
263
|
-
|
|
264
|
-
// First chunk
|
|
265
|
-
let chunk1 = build_single_chunk("0123456789", 0, 2, &mut offset, 3, None).unwrap();
|
|
266
|
-
assert_eq!(chunk1.metadata.byte_start, 0);
|
|
267
|
-
assert_eq!(chunk1.metadata.byte_end, 10);
|
|
268
|
-
assert_eq!(offset, 7); // 10 - 3 (overlap)
|
|
269
|
-
|
|
270
|
-
// Second chunk
|
|
271
|
-
let chunk2 = build_single_chunk("ABCDEFGHIJ", 1, 2, &mut offset, 3, None).unwrap();
|
|
272
|
-
assert_eq!(chunk2.metadata.byte_start, 7);
|
|
273
|
-
assert_eq!(chunk2.metadata.byte_end, 17);
|
|
274
|
-
assert_eq!(offset, 17); // Last chunk, no overlap subtracted
|
|
275
|
-
}
|
|
276
|
-
|
|
277
178
|
#[test]
|
|
278
179
|
fn test_build_chunks_no_overlap() {
|
|
279
|
-
let
|
|
280
|
-
let
|
|
180
|
+
let source = "AAAAABBBBBCCCCC";
|
|
181
|
+
let text_chunks = vec![&source[0..5], &source[5..10], &source[10..15]];
|
|
182
|
+
let result = build_chunks(source, text_chunks, None).unwrap();
|
|
281
183
|
|
|
282
184
|
assert_eq!(result.len(), 3);
|
|
283
185
|
|
|
284
|
-
// Chunks should be contiguous with no overlap
|
|
285
186
|
assert_eq!(result[0].metadata.byte_start, 0);
|
|
286
187
|
assert_eq!(result[0].metadata.byte_end, 5);
|
|
287
188
|
|