kreuzberg 4.4.2 → 4.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -6
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +13 -13
  5. data/ext/kreuzberg_rb/native/Cargo.toml +5 -5
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +7 -0
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/vendor/Cargo.toml +4 -3
  9. data/vendor/kreuzberg/Cargo.toml +6 -6
  10. data/vendor/kreuzberg/README.md +1 -1
  11. data/vendor/kreuzberg/src/chunking/builder.rs +42 -141
  12. data/vendor/kreuzberg/src/chunking/core.rs +136 -1
  13. data/vendor/kreuzberg/src/core/config/extraction/types.rs +7 -0
  14. data/vendor/kreuzberg/src/core/pipeline/features.rs +52 -0
  15. data/vendor/kreuzberg/src/core/pipeline/mod.rs +3 -1
  16. data/vendor/kreuzberg/src/extraction/docx/parser.rs +1 -1
  17. data/vendor/kreuzberg/src/extraction/html/converter.rs +34 -33
  18. data/vendor/kreuzberg/src/extraction/html/mod.rs +5 -1
  19. data/vendor/kreuzberg/src/extraction/html/processor.rs +2 -0
  20. data/vendor/kreuzberg/src/extraction/mod.rs +1 -1
  21. data/vendor/kreuzberg/src/extractors/html.rs +99 -192
  22. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +8 -3
  23. data/vendor/kreuzberg/src/mcp/server.rs +6 -7
  24. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -11
  25. data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +121 -13
  26. data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
  27. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +2 -0
  28. data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
  29. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  30. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +2 -2
  31. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  32. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  33. data/vendor/kreuzberg-tesseract/build.rs +140 -27
  34. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b65c17f6cf4332ce366d8da0197077c2e222a262a2da29b0681c1139918527bf
4
- data.tar.gz: 1082caefd179e5eeaa44b69ad93f02458dae2e6cc2edf5aaf1dfc1eee838276c
3
+ metadata.gz: 879027155ef841eb494ce23bdd9817664d93cc86acb59cf7d04408b6b717cff5
4
+ data.tar.gz: be26ecbf4ad1f07cfa7f02a197ef355ed534e4a708361b8a2fcf4feb6bce514a
5
5
  SHA512:
6
- metadata.gz: '01496d04b7671527a8e861ac3d155a4c1ed96c75b6898d419ad021edba646e896fbd3fa46920c0a8ebf8d7eba79ab3ad83edce44261b813b16a1e6d28161fc61'
7
- data.tar.gz: 64aa2e0654a3300c06948c65ed1d944f2c05f08f9989fcd27f5ab38e20772292d5913edea6beed522da4056ba0288f4416bed1ee6f90438155e3ec112d6ef4cb
6
+ metadata.gz: 980236a2633bfdc3b835aa8bf4b215aa623be2d61d74758bc6b668d5bcc48fecd7ed9dd97db5cb3a65dce3b2aae9aee8b15f2adb5df89bd48f7179fe0daab506
7
+ data.tar.gz: 57a7ed2f02591dbe8c7a79078504e2130953bb177151d246b8dffab40c9db1b260b526ac4abd0780b88835ea3378f835583a9400af7ae32481ad9c90ac4e86eb
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.4.2)
4
+ kreuzberg (4.4.3)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -110,7 +110,7 @@ GEM
110
110
  diff-lcs (>= 1.2.0, < 2.0)
111
111
  rspec-support (~> 3.13.0)
112
112
  rspec-support (3.13.7)
113
- rubocop (1.85.0)
113
+ rubocop (1.85.1)
114
114
  json (~> 2.3)
115
115
  language_server-protocol (~> 3.17.0.2)
116
116
  lint_roller (~> 1.1.0)
@@ -134,7 +134,7 @@ GEM
134
134
  rubocop (~> 1.81)
135
135
  ruby-progressbar (1.13.0)
136
136
  securerandom (0.4.1)
137
- sorbet-runtime (0.6.12984)
137
+ sorbet-runtime (0.6.12993)
138
138
  steep (1.10.0)
139
139
  activesupport (>= 5.1)
140
140
  concurrent-ruby (>= 1.1.10)
@@ -222,7 +222,7 @@ CHECKSUMS
222
222
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
223
223
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
224
224
  json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
225
- kreuzberg (4.4.2)
225
+ kreuzberg (4.4.3)
226
226
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
227
227
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
228
228
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -253,13 +253,13 @@ CHECKSUMS
253
253
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
254
254
  rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
255
255
  rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
256
- rubocop (1.85.0) sha256=317407feb681a07d54f64d2f9e1d6b6af1ce7678e51cd658e3ad8bd66da48c01
256
+ rubocop (1.85.1) sha256=3dbcf9e961baa4c376eeeb2a03913dca5e3987033b04d38fa538aa1e7406cc77
257
257
  rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
258
258
  rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
259
259
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
260
260
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
261
261
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
262
- sorbet-runtime (0.6.12984) sha256=3fff20a5b147a2e191210563d61886ac121fc1cd8b5e0faf6bc18873139e0fe4
262
+ sorbet-runtime (0.6.12993) sha256=5720d6e70063ed39528ddb18248c13a8072cf6991cf7d6652dcc0b8e9bc6b4ac
263
263
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
264
264
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
265
265
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.3" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1390,9 +1390,9 @@ dependencies = [
1390
1390
 
1391
1391
  [[package]]
1392
1392
  name = "fastembed"
1393
- version = "5.11.0"
1393
+ version = "5.12.0"
1394
1394
  source = "registry+https://github.com/rust-lang/crates.io-index"
1395
- checksum = "b4339d45a80579ab8305616a501eacdbf18fb0f7def7fa6e4c0b75941416d5b0"
1395
+ checksum = "e7b01c79c5cb8ab3ce31c3d52916fda278e14cac027ff3a9cb66c419ed7288f8"
1396
1396
  dependencies = [
1397
1397
  "anyhow",
1398
1398
  "hf-hub 0.4.3",
@@ -1977,9 +1977,9 @@ dependencies = [
1977
1977
 
1978
1978
  [[package]]
1979
1979
  name = "html-to-markdown-rs"
1980
- version = "2.27.2"
1980
+ version = "2.28.0"
1981
1981
  source = "registry+https://github.com/rust-lang/crates.io-index"
1982
- checksum = "06f53f29c48f234830ccc312f652770723419fe9b3529fa5e0b027a0fedb7869"
1982
+ checksum = "842656c6f85a8e7b35fd5656646ffa0d6644cdcfe429634c7d9923a5626e330b"
1983
1983
  dependencies = [
1984
1984
  "ahash",
1985
1985
  "astral-tl",
@@ -2661,7 +2661,7 @@ dependencies = [
2661
2661
  "thiserror 2.0.18",
2662
2662
  "tiff 0.11.3",
2663
2663
  "tokio",
2664
- "toml 1.0.3+spec-1.1.0",
2664
+ "toml 1.0.4+spec-1.1.0",
2665
2665
  "tower",
2666
2666
  "tower-http",
2667
2667
  "tracing",
@@ -2729,7 +2729,7 @@ dependencies = [
2729
2729
 
2730
2730
  [[package]]
2731
2731
  name = "kreuzberg-rb"
2732
- version = "4.4.1"
2732
+ version = "4.4.3"
2733
2733
  dependencies = [
2734
2734
  "async-trait",
2735
2735
  "html-to-markdown-rs",
@@ -2741,7 +2741,7 @@ dependencies = [
2741
2741
  "serde_json",
2742
2742
  "serde_yaml_ng",
2743
2743
  "tokio",
2744
- "toml 1.0.3+spec-1.1.0",
2744
+ "toml 1.0.4+spec-1.1.0",
2745
2745
  ]
2746
2746
 
2747
2747
  [[package]]
@@ -5294,9 +5294,9 @@ dependencies = [
5294
5294
 
5295
5295
  [[package]]
5296
5296
  name = "toml"
5297
- version = "1.0.3+spec-1.1.0"
5297
+ version = "1.0.4+spec-1.1.0"
5298
5298
  source = "registry+https://github.com/rust-lang/crates.io-index"
5299
- checksum = "c7614eaf19ad818347db24addfa201729cf2a9b6fdfd9eb0ab870fcacc606c0c"
5299
+ checksum = "c94c3321114413476740df133f0d8862c61d87c8d26f04c6841e033c8c80db47"
5300
5300
  dependencies = [
5301
5301
  "indexmap",
5302
5302
  "serde_core",
@@ -5700,9 +5700,9 @@ dependencies = [
5700
5700
 
5701
5701
  [[package]]
5702
5702
  name = "uuid"
5703
- version = "1.21.0"
5703
+ version = "1.22.0"
5704
5704
  source = "registry+https://github.com/rust-lang/crates.io-index"
5705
- checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb"
5705
+ checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37"
5706
5706
  dependencies = [
5707
5707
  "getrandom 0.4.2",
5708
5708
  "js-sys",
@@ -6314,9 +6314,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
6314
6314
 
6315
6315
  [[package]]
6316
6316
  name = "winnow"
6317
- version = "0.7.14"
6317
+ version = "0.7.15"
6318
6318
  source = "registry+https://github.com/rust-lang/crates.io-index"
6319
- checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
6319
+ checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
6320
6320
 
6321
6321
  [[package]]
6322
6322
  name = "wit-bindgen"
@@ -25,10 +25,10 @@ hex = "0.4.3"
25
25
  num_cpus = "1.17.0"
26
26
  once_cell = "1.21.3"
27
27
  parking_lot = "0.12.5"
28
- html-to-markdown-rs = { version = "2.27.2", default-features = false }
28
+ html-to-markdown-rs = { version = "2.28.0", default-features = false }
29
29
  reqwest = { version = "0.13.2", default-features = false }
30
30
  image = { version = "0.25.9", default-features = false }
31
- toml = "1.0.3"
31
+ toml = "1.0.4"
32
32
  tempfile = "3.26.0"
33
33
  lzma-rust2 = { version = "0.16.2" }
34
34
  log = "0.4"
@@ -45,7 +45,7 @@ collapsible_if = "allow"
45
45
 
46
46
  [package]
47
47
  name = "kreuzberg-rb"
48
- version = "4.4.2"
48
+ version = "4.4.3"
49
49
  edition = "2024"
50
50
  rust-version = "1.91"
51
51
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -99,7 +99,7 @@ rb-sys = { version = "0.9.124", default-features = false, features = [
99
99
  "stable-api-compiled-fallback",
100
100
  ] }
101
101
  serde_json = "1.0.149"
102
- toml = "1.0.3"
102
+ toml = "1.0.4"
103
103
  serde_yaml_ng = "0.10"
104
104
  tokio = { version = "1.50.0", features = [
105
105
  "rt",
@@ -111,7 +111,7 @@ tokio = { version = "1.50.0", features = [
111
111
  "time",
112
112
  "io-util",
113
113
  ] }
114
- html-to-markdown-rs = { version = "2.27.2", default-features = false }
114
+ html-to-markdown-rs = { version = "2.28.0", default-features = false }
115
115
 
116
116
  [dev-dependencies]
117
117
  pretty_assertions = "1.4"
@@ -317,10 +317,17 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
317
317
  600
318
318
  };
319
319
 
320
+ let inject_placeholders = if let Some(val) = get_kw(ruby, hash, "inject_placeholders") {
321
+ bool::try_convert(val)?
322
+ } else {
323
+ true
324
+ };
325
+
320
326
  let config = ImageExtractionConfig {
321
327
  extract_images,
322
328
  target_dpi,
323
329
  max_image_dimension,
330
+ inject_placeholders,
324
331
  auto_adjust_dpi,
325
332
  min_dpi,
326
333
  max_dpi,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.4.2'
4
+ VERSION = '4.4.3'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.4.2"
5
+ version = "4.4.3"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -23,7 +23,7 @@ criterion = { version = "0.8", features = ["html_reports"] }
23
23
  ctor = "0.6"
24
24
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
25
25
  hex = "0.4.3"
26
- html-to-markdown-rs = { version = "2.27.2", default-features = false }
26
+ html-to-markdown-rs = { version = "2.28.0", default-features = false }
27
27
  image = { version = "0.25.9", default-features = false }
28
28
  itertools = "0.14"
29
29
  js-sys = "0.3"
@@ -33,6 +33,7 @@ log = "0.4"
33
33
  lzma-rust2 = { version = "0.16.2" }
34
34
  num_cpus = "1.17.0"
35
35
  once_cell = "1.21.3"
36
+ ort = { version = "=2.0.0-rc.11", default-features = false }
36
37
  parking_lot = "0.12.5"
37
38
  pdfium-render = { package = "kreuzberg-pdfium-render", version = "4.3" }
38
39
  rayon = "1.11.0"
@@ -42,7 +43,7 @@ serde_json = { version = "1.0.149" }
42
43
  tempfile = "3.26.0"
43
44
  thiserror = "2.0.18"
44
45
  tokio = { version = "1.50.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
45
- toml = "1.0.3"
46
+ toml = "1.0.4"
46
47
  tracing = "0.1"
47
48
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
48
49
  wasm-bindgen-futures = "0.4"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.4.2"
3
+ version = "4.4.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -153,7 +153,7 @@ serde = { version = "1.0.228", features = ["derive"] }
153
153
  serde_json = { version = "1.0.149" }
154
154
  serde_yaml_ng = "0.10.0"
155
155
  jotdown = "0.9"
156
- toml = "1.0.3"
156
+ toml = "1.0.4"
157
157
  mime_guess = "2.0"
158
158
  rmp-serde = "1.3"
159
159
  thiserror = "2.0.18"
@@ -169,8 +169,8 @@ zip = { version = "8.2.0", optional = true, default-features = false, features =
169
169
  "deflate-flate2",
170
170
  ] }
171
171
  mail-parser = { version = "0.11.2", optional = true }
172
- html-to-markdown-rs = { version = "2.27.2", default-features = false , features = [
173
- "inline-images", "metadata", ], optional = true }
172
+ html-to-markdown-rs = { version = "2.28.0", default-features = false , features = [
173
+ "inline-images", "metadata", "visitor", ], optional = true }
174
174
  cfb = { version = "0.14.0", optional = true }
175
175
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
176
176
  tar = { version = "0.4.44", optional = true }
@@ -244,7 +244,7 @@ kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "4.3", opti
244
244
  reqwest = { version = "0.13.2", default-features = false , features = [
245
245
  "json", "rustls", "blocking", ], optional = true }
246
246
  # Use rustls-tls for fastembed on non-Windows platforms
247
- fastembed = { version = "5.11", default-features = false, features = [
247
+ fastembed = { version = "5.12", default-features = false, features = [
248
248
  "hf-hub-rustls-tls",
249
249
  "ort-load-dynamic",
250
250
  ], optional = true }
@@ -260,7 +260,7 @@ kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "4.3", opti
260
260
  reqwest = { version = "0.13.2", default-features = false , features = [
261
261
  "json", "native-tls", "blocking", ], optional = true }
262
262
  # Use native-tls for fastembed on Windows
263
- fastembed = { version = "5.11", default-features = false, features = [
263
+ fastembed = { version = "5.12", default-features = false, features = [
264
264
  "hf-hub-native-tls",
265
265
  "ort-load-dynamic",
266
266
  ], optional = true }
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.4.2 Release**
20
+ > **🚀 Version 4.4.3 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -35,14 +35,15 @@ pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) ->
35
35
  ///
36
36
  /// This function takes a collection of text segments (produced by a text splitter)
37
37
  /// and constructs Chunk objects with proper metadata, including:
38
- /// - Byte offsets accounting for overlap
38
+ /// - Byte offsets derived from the chunk's position in the source text
39
39
  /// - Chunk indices and total count
40
40
  /// - Page boundary information (if provided)
41
41
  ///
42
42
  /// # Arguments
43
43
  ///
44
+ /// * `source_text` - The original text that the chunks were split from. Chunk
45
+ /// slices must borrow from this text (as `text-splitter` guarantees).
44
46
  /// * `text_chunks` - Iterator of text segments to convert into chunks
45
- /// * `overlap` - Number of characters to overlap between chunks
46
47
  /// * `page_boundaries` - Optional page boundary markers for mapping chunks to pages
47
48
  ///
48
49
  /// # Returns
@@ -53,8 +54,8 @@ pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) ->
53
54
  ///
54
55
  /// Returns an error if page boundary calculation fails.
55
56
  pub fn build_chunks<'a, I>(
57
+ source_text: &'a str,
56
58
  text_chunks: I,
57
- overlap: usize,
58
59
  page_boundaries: Option<&[PageBoundary]>,
59
60
  ) -> Result<Vec<Chunk>>
60
61
  where
@@ -62,86 +63,37 @@ where
62
63
  {
63
64
  let chunks_vec: Vec<&str> = text_chunks.into_iter().collect();
64
65
  let total_chunks = chunks_vec.len();
65
- let mut byte_offset = 0;
66
+ let source_start = source_text.as_ptr() as usize;
66
67
  let mut chunks = Vec::with_capacity(total_chunks);
67
68
 
68
69
  for (index, chunk_text) in chunks_vec.into_iter().enumerate() {
69
- let chunk = build_single_chunk(
70
- chunk_text,
71
- index,
72
- total_chunks,
73
- &mut byte_offset,
74
- overlap,
75
- page_boundaries,
76
- )?;
77
- chunks.push(chunk);
70
+ let byte_start = chunk_text.as_ptr() as usize - source_start;
71
+ let byte_end = byte_start + chunk_text.len();
72
+
73
+ let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
74
+ calculate_page_range(byte_start, byte_end, boundaries)?
75
+ } else {
76
+ (None, None)
77
+ };
78
+
79
+ chunks.push(Chunk {
80
+ content: chunk_text.to_string(),
81
+ embedding: None,
82
+ metadata: ChunkMetadata {
83
+ byte_start,
84
+ byte_end,
85
+ token_count: None,
86
+ chunk_index: index,
87
+ total_chunks,
88
+ first_page,
89
+ last_page,
90
+ },
91
+ });
78
92
  }
79
93
 
80
94
  Ok(chunks)
81
95
  }
82
96
 
83
- /// Build a single chunk with metadata.
84
- ///
85
- /// # Arguments
86
- ///
87
- /// * `chunk_text` - The text content for this chunk
88
- /// * `index` - Zero-based index of this chunk
89
- /// * `total_chunks` - Total number of chunks in the collection
90
- /// * `byte_offset` - Mutable reference to current byte offset (will be updated)
91
- /// * `overlap` - Number of characters to overlap between chunks
92
- /// * `page_boundaries` - Optional page boundary markers
93
- ///
94
- /// # Returns
95
- ///
96
- /// A complete Chunk object with all metadata filled in.
97
- ///
98
- /// # Errors
99
- ///
100
- /// Returns an error if page boundary calculation fails.
101
- fn build_single_chunk(
102
- chunk_text: &str,
103
- index: usize,
104
- total_chunks: usize,
105
- byte_offset: &mut usize,
106
- overlap: usize,
107
- page_boundaries: Option<&[PageBoundary]>,
108
- ) -> Result<Chunk> {
109
- let byte_start = *byte_offset;
110
- let chunk_length = chunk_text.len();
111
- let byte_end = byte_start + chunk_length;
112
-
113
- // Calculate overlap for next chunk (not applicable to last chunk)
114
- let overlap_chars = if index < total_chunks - 1 {
115
- overlap.min(chunk_length)
116
- } else {
117
- 0
118
- };
119
-
120
- // Update offset for next chunk, accounting for overlap
121
- *byte_offset = byte_end - overlap_chars;
122
-
123
- // Calculate page range if boundaries are provided
124
- let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
125
- calculate_page_range(byte_start, byte_end, boundaries)?
126
- } else {
127
- (None, None)
128
- };
129
-
130
- Ok(Chunk {
131
- content: chunk_text.to_string(),
132
- embedding: None,
133
- metadata: ChunkMetadata {
134
- byte_start,
135
- byte_end,
136
- token_count: None,
137
- chunk_index: index,
138
- total_chunks,
139
- first_page,
140
- last_page,
141
- },
142
- })
143
- }
144
-
145
97
  #[cfg(test)]
146
98
  mod tests {
147
99
  use super::*;
@@ -162,15 +114,16 @@ mod tests {
162
114
 
163
115
  #[test]
164
116
  fn test_build_chunks_empty() {
117
+ let source = "";
165
118
  let text_chunks: Vec<&str> = vec![];
166
- let result = build_chunks(text_chunks, 5, None).unwrap();
119
+ let result = build_chunks(source, text_chunks, None).unwrap();
167
120
  assert_eq!(result.len(), 0);
168
121
  }
169
122
 
170
123
  #[test]
171
124
  fn test_build_chunks_single() {
172
- let text_chunks = vec!["Single chunk"];
173
- let result = build_chunks(text_chunks, 5, None).unwrap();
125
+ let source = "Single chunk";
126
+ let result = build_chunks(source, vec![source], None).unwrap();
174
127
  assert_eq!(result.len(), 1);
175
128
  assert_eq!(result[0].content, "Single chunk");
176
129
  assert_eq!(result[0].metadata.chunk_index, 0);
@@ -179,29 +132,10 @@ mod tests {
179
132
  assert_eq!(result[0].metadata.byte_end, 12);
180
133
  }
181
134
 
182
- #[test]
183
- fn test_build_chunks_multiple_with_overlap() {
184
- let text_chunks = vec!["First chunk here", "Second chunk here", "Third chunk here"];
185
- let overlap = 5;
186
- let result = build_chunks(text_chunks, overlap, None).unwrap();
187
-
188
- assert_eq!(result.len(), 3);
189
-
190
- // First chunk
191
- assert_eq!(result[0].content, "First chunk here");
192
- assert_eq!(result[0].metadata.byte_start, 0);
193
- assert_eq!(result[0].metadata.byte_end, 16);
194
-
195
- // Second chunk should start before first ends (overlap)
196
- assert!(result[1].metadata.byte_start < result[0].metadata.byte_end);
197
-
198
- // Third chunk should start before second ends (overlap)
199
- assert!(result[2].metadata.byte_start < result[1].metadata.byte_end);
200
- }
201
-
202
135
  #[test]
203
136
  fn test_build_chunks_with_page_boundaries() {
204
- let text_chunks = vec!["First chunk", "Second chunk"];
137
+ let source = "First chunkSecond chunk";
138
+ let text_chunks = vec![&source[0..11], &source[11..23]];
205
139
  let boundaries = vec![
206
140
  PageBoundary {
207
141
  byte_start: 0,
@@ -215,7 +149,7 @@ mod tests {
215
149
  },
216
150
  ];
217
151
 
218
- let result = build_chunks(text_chunks, 0, Some(&boundaries)).unwrap();
152
+ let result = build_chunks(source, text_chunks, Some(&boundaries)).unwrap();
219
153
 
220
154
  assert_eq!(result.len(), 2);
221
155
  assert_eq!(result[0].metadata.first_page, Some(1));
@@ -223,65 +157,32 @@ mod tests {
223
157
  }
224
158
 
225
159
  #[test]
226
- fn test_build_chunks_offset_tracking() {
227
- let text_chunks = vec!["AAAAA", "BBBBB", "CCCCC"];
228
- let overlap = 2;
229
- let result = build_chunks(text_chunks, overlap, None).unwrap();
160
+ fn test_build_chunks_offset_from_source() {
161
+ let source = "AAAAABBBBBCCCCC";
162
+ // Overlapping slices from source
163
+ let text_chunks = vec![&source[0..5], &source[3..8], &source[6..11]];
164
+ let result = build_chunks(source, text_chunks, None).unwrap();
230
165
 
231
166
  assert_eq!(result.len(), 3);
232
167
 
233
- // First chunk: 0-5
234
168
  assert_eq!(result[0].metadata.byte_start, 0);
235
169
  assert_eq!(result[0].metadata.byte_end, 5);
236
170
 
237
- // Second chunk: 3-8 (overlap of 2)
238
171
  assert_eq!(result[1].metadata.byte_start, 3);
239
172
  assert_eq!(result[1].metadata.byte_end, 8);
240
173
 
241
- // Third chunk: 6-11 (overlap of 2, but last chunk so no further adjustment)
242
174
  assert_eq!(result[2].metadata.byte_start, 6);
243
175
  assert_eq!(result[2].metadata.byte_end, 11);
244
176
  }
245
177
 
246
- #[test]
247
- fn test_build_single_chunk_metadata() {
248
- let mut offset = 0;
249
- let chunk = build_single_chunk("Test content", 0, 1, &mut offset, 5, None).unwrap();
250
-
251
- assert_eq!(chunk.content, "Test content");
252
- assert_eq!(chunk.metadata.byte_start, 0);
253
- assert_eq!(chunk.metadata.byte_end, 12);
254
- assert_eq!(chunk.metadata.chunk_index, 0);
255
- assert_eq!(chunk.metadata.total_chunks, 1);
256
- assert_eq!(chunk.metadata.first_page, None);
257
- assert_eq!(chunk.metadata.last_page, None);
258
- }
259
-
260
- #[test]
261
- fn test_build_single_chunk_with_overlap() {
262
- let mut offset = 0;
263
-
264
- // First chunk
265
- let chunk1 = build_single_chunk("0123456789", 0, 2, &mut offset, 3, None).unwrap();
266
- assert_eq!(chunk1.metadata.byte_start, 0);
267
- assert_eq!(chunk1.metadata.byte_end, 10);
268
- assert_eq!(offset, 7); // 10 - 3 (overlap)
269
-
270
- // Second chunk
271
- let chunk2 = build_single_chunk("ABCDEFGHIJ", 1, 2, &mut offset, 3, None).unwrap();
272
- assert_eq!(chunk2.metadata.byte_start, 7);
273
- assert_eq!(chunk2.metadata.byte_end, 17);
274
- assert_eq!(offset, 17); // Last chunk, no overlap subtracted
275
- }
276
-
277
178
  #[test]
278
179
  fn test_build_chunks_no_overlap() {
279
- let text_chunks = vec!["AAAAA", "BBBBB", "CCCCC"];
280
- let result = build_chunks(text_chunks, 0, None).unwrap();
180
+ let source = "AAAAABBBBBCCCCC";
181
+ let text_chunks = vec![&source[0..5], &source[5..10], &source[10..15]];
182
+ let result = build_chunks(source, text_chunks, None).unwrap();
281
183
 
282
184
  assert_eq!(result.len(), 3);
283
185
 
284
- // Chunks should be contiguous with no overlap
285
186
  assert_eq!(result[0].metadata.byte_start, 0);
286
187
  assert_eq!(result[0].metadata.byte_end, 5);
287
188