spnl 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spnl might be problematic. Click here for more details.
- {spnl-0.6.0 → spnl-0.6.1}/Cargo.lock +44 -33
- {spnl-0.6.0 → spnl-0.6.1}/PKG-INFO +1 -1
- {spnl-0.6.0 → spnl-0.6.1}/spnl/Cargo.toml +1 -1
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/tokenize.rs +133 -10
- {spnl-0.6.0 → spnl-0.6.1}/Cargo.toml +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/README.md +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/pyproject.toml +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/README.md +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/examples/.gitignore +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/examples/user.py +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/embed.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/index/layer1.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/index/mod.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/index/raptor.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/index/simple_embed_retrieve.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/index/windowing.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/mod.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/options.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/retrieve.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/augment/storage.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/chat_template.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/execute.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/generate/backend/mod.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/generate/backend/openai.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/generate/backend/spnl.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/generate/mod.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/lib.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/lisp.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/plan.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/pull.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/python.rs +0 -0
- {spnl-0.6.0 → spnl-0.6.1}/spnl/src/query.rs +0 -0
|
@@ -149,9 +149,9 @@ dependencies = [
|
|
|
149
149
|
|
|
150
150
|
[[package]]
|
|
151
151
|
name = "anyhow"
|
|
152
|
-
version = "1.0.
|
|
152
|
+
version = "1.0.100"
|
|
153
153
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
154
|
-
checksum = "
|
|
154
|
+
checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
|
|
155
155
|
|
|
156
156
|
[[package]]
|
|
157
157
|
name = "arc-swap"
|
|
@@ -2890,7 +2890,7 @@ dependencies = [
|
|
|
2890
2890
|
|
|
2891
2891
|
[[package]]
|
|
2892
2892
|
name = "haystack"
|
|
2893
|
-
version = "0.6.
|
|
2893
|
+
version = "0.6.1"
|
|
2894
2894
|
dependencies = [
|
|
2895
2895
|
"anyhow",
|
|
2896
2896
|
"clap",
|
|
@@ -3500,9 +3500,9 @@ dependencies = [
|
|
|
3500
3500
|
|
|
3501
3501
|
[[package]]
|
|
3502
3502
|
name = "js-sys"
|
|
3503
|
-
version = "0.3.
|
|
3503
|
+
version = "0.3.80"
|
|
3504
3504
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3505
|
-
checksum = "
|
|
3505
|
+
checksum = "852f13bec5eba4ba9afbeb93fd7c13fe56147f055939ae21c43a29a0ecb2702e"
|
|
3506
3506
|
dependencies = [
|
|
3507
3507
|
"once_cell",
|
|
3508
3508
|
"wasm-bindgen",
|
|
@@ -5031,7 +5031,7 @@ checksum = "9cd31dcfdbbd7431a807ef4df6edd6473228e94d5c805e8cf671227a21bad068"
|
|
|
5031
5031
|
dependencies = [
|
|
5032
5032
|
"anyhow",
|
|
5033
5033
|
"clap",
|
|
5034
|
-
"itertools 0.
|
|
5034
|
+
"itertools 0.13.0",
|
|
5035
5035
|
"proc-macro2",
|
|
5036
5036
|
"quote",
|
|
5037
5037
|
"rand 0.8.5",
|
|
@@ -5218,7 +5218,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
5218
5218
|
checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
|
|
5219
5219
|
dependencies = [
|
|
5220
5220
|
"heck",
|
|
5221
|
-
"itertools 0.
|
|
5221
|
+
"itertools 0.13.0",
|
|
5222
5222
|
"log",
|
|
5223
5223
|
"multimap",
|
|
5224
5224
|
"once_cell",
|
|
@@ -5238,7 +5238,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
5238
5238
|
checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
|
|
5239
5239
|
dependencies = [
|
|
5240
5240
|
"anyhow",
|
|
5241
|
-
"itertools 0.
|
|
5241
|
+
"itertools 0.13.0",
|
|
5242
5242
|
"proc-macro2",
|
|
5243
5243
|
"quote",
|
|
5244
5244
|
"syn 2.0.106",
|
|
@@ -6158,10 +6158,11 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
|
|
|
6158
6158
|
|
|
6159
6159
|
[[package]]
|
|
6160
6160
|
name = "serde"
|
|
6161
|
-
version = "1.0.
|
|
6161
|
+
version = "1.0.225"
|
|
6162
6162
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6163
|
-
checksum = "
|
|
6163
|
+
checksum = "fd6c24dee235d0da097043389623fb913daddf92c76e9f5a1db88607a0bcbd1d"
|
|
6164
6164
|
dependencies = [
|
|
6165
|
+
"serde_core",
|
|
6165
6166
|
"serde_derive",
|
|
6166
6167
|
]
|
|
6167
6168
|
|
|
@@ -6175,11 +6176,20 @@ dependencies = [
|
|
|
6175
6176
|
"serde",
|
|
6176
6177
|
]
|
|
6177
6178
|
|
|
6179
|
+
[[package]]
|
|
6180
|
+
name = "serde_core"
|
|
6181
|
+
version = "1.0.225"
|
|
6182
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6183
|
+
checksum = "659356f9a0cb1e529b24c01e43ad2bdf520ec4ceaf83047b83ddcc2251f96383"
|
|
6184
|
+
dependencies = [
|
|
6185
|
+
"serde_derive",
|
|
6186
|
+
]
|
|
6187
|
+
|
|
6178
6188
|
[[package]]
|
|
6179
6189
|
name = "serde_derive"
|
|
6180
|
-
version = "1.0.
|
|
6190
|
+
version = "1.0.225"
|
|
6181
6191
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6182
|
-
checksum = "
|
|
6192
|
+
checksum = "0ea936adf78b1f766949a4977b91d2f5595825bd6ec079aa9543ad2685fc4516"
|
|
6183
6193
|
dependencies = [
|
|
6184
6194
|
"proc-macro2",
|
|
6185
6195
|
"quote",
|
|
@@ -6188,14 +6198,15 @@ dependencies = [
|
|
|
6188
6198
|
|
|
6189
6199
|
[[package]]
|
|
6190
6200
|
name = "serde_json"
|
|
6191
|
-
version = "1.0.
|
|
6201
|
+
version = "1.0.145"
|
|
6192
6202
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6193
|
-
checksum = "
|
|
6203
|
+
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
|
|
6194
6204
|
dependencies = [
|
|
6195
6205
|
"itoa",
|
|
6196
6206
|
"memchr",
|
|
6197
6207
|
"ryu",
|
|
6198
6208
|
"serde",
|
|
6209
|
+
"serde_core",
|
|
6199
6210
|
]
|
|
6200
6211
|
|
|
6201
6212
|
[[package]]
|
|
@@ -6483,7 +6494,7 @@ dependencies = [
|
|
|
6483
6494
|
|
|
6484
6495
|
[[package]]
|
|
6485
6496
|
name = "spnl"
|
|
6486
|
-
version = "0.6.
|
|
6497
|
+
version = "0.6.1"
|
|
6487
6498
|
dependencies = [
|
|
6488
6499
|
"anyhow",
|
|
6489
6500
|
"arrow-array",
|
|
@@ -6521,7 +6532,7 @@ dependencies = [
|
|
|
6521
6532
|
|
|
6522
6533
|
[[package]]
|
|
6523
6534
|
name = "spnl-cli"
|
|
6524
|
-
version = "0.6.
|
|
6535
|
+
version = "0.6.1"
|
|
6525
6536
|
dependencies = [
|
|
6526
6537
|
"anyhow",
|
|
6527
6538
|
"clap",
|
|
@@ -6536,7 +6547,7 @@ dependencies = [
|
|
|
6536
6547
|
|
|
6537
6548
|
[[package]]
|
|
6538
6549
|
name = "spnl-wasm"
|
|
6539
|
-
version = "0.6.
|
|
6550
|
+
version = "0.6.1"
|
|
6540
6551
|
dependencies = [
|
|
6541
6552
|
"serde_json",
|
|
6542
6553
|
"spnl",
|
|
@@ -7000,9 +7011,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
|
|
7000
7011
|
|
|
7001
7012
|
[[package]]
|
|
7002
7013
|
name = "tokenizers"
|
|
7003
|
-
version = "0.22.
|
|
7014
|
+
version = "0.22.1"
|
|
7004
7015
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7005
|
-
checksum = "
|
|
7016
|
+
checksum = "6475a27088c98ea96d00b39a9ddfb63780d1ad4cceb6f48374349a96ab2b7842"
|
|
7006
7017
|
dependencies = [
|
|
7007
7018
|
"ahash",
|
|
7008
7019
|
"aho-corasick",
|
|
@@ -7454,9 +7465,9 @@ dependencies = [
|
|
|
7454
7465
|
|
|
7455
7466
|
[[package]]
|
|
7456
7467
|
name = "wasm-bindgen"
|
|
7457
|
-
version = "0.2.
|
|
7468
|
+
version = "0.2.103"
|
|
7458
7469
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7459
|
-
checksum = "
|
|
7470
|
+
checksum = "ab10a69fbd0a177f5f649ad4d8d3305499c42bab9aef2f7ff592d0ec8f833819"
|
|
7460
7471
|
dependencies = [
|
|
7461
7472
|
"cfg-if",
|
|
7462
7473
|
"once_cell",
|
|
@@ -7467,9 +7478,9 @@ dependencies = [
|
|
|
7467
7478
|
|
|
7468
7479
|
[[package]]
|
|
7469
7480
|
name = "wasm-bindgen-backend"
|
|
7470
|
-
version = "0.2.
|
|
7481
|
+
version = "0.2.103"
|
|
7471
7482
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7472
|
-
checksum = "
|
|
7483
|
+
checksum = "0bb702423545a6007bbc368fde243ba47ca275e549c8a28617f56f6ba53b1d1c"
|
|
7473
7484
|
dependencies = [
|
|
7474
7485
|
"bumpalo",
|
|
7475
7486
|
"log",
|
|
@@ -7481,9 +7492,9 @@ dependencies = [
|
|
|
7481
7492
|
|
|
7482
7493
|
[[package]]
|
|
7483
7494
|
name = "wasm-bindgen-futures"
|
|
7484
|
-
version = "0.4.
|
|
7495
|
+
version = "0.4.53"
|
|
7485
7496
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7486
|
-
checksum = "
|
|
7497
|
+
checksum = "a0b221ff421256839509adbb55998214a70d829d3a28c69b4a6672e9d2a42f67"
|
|
7487
7498
|
dependencies = [
|
|
7488
7499
|
"cfg-if",
|
|
7489
7500
|
"js-sys",
|
|
@@ -7494,9 +7505,9 @@ dependencies = [
|
|
|
7494
7505
|
|
|
7495
7506
|
[[package]]
|
|
7496
7507
|
name = "wasm-bindgen-macro"
|
|
7497
|
-
version = "0.2.
|
|
7508
|
+
version = "0.2.103"
|
|
7498
7509
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7499
|
-
checksum = "
|
|
7510
|
+
checksum = "fc65f4f411d91494355917b605e1480033152658d71f722a90647f56a70c88a0"
|
|
7500
7511
|
dependencies = [
|
|
7501
7512
|
"quote",
|
|
7502
7513
|
"wasm-bindgen-macro-support",
|
|
@@ -7504,9 +7515,9 @@ dependencies = [
|
|
|
7504
7515
|
|
|
7505
7516
|
[[package]]
|
|
7506
7517
|
name = "wasm-bindgen-macro-support"
|
|
7507
|
-
version = "0.2.
|
|
7518
|
+
version = "0.2.103"
|
|
7508
7519
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7509
|
-
checksum = "
|
|
7520
|
+
checksum = "ffc003a991398a8ee604a401e194b6b3a39677b3173d6e74495eb51b82e99a32"
|
|
7510
7521
|
dependencies = [
|
|
7511
7522
|
"proc-macro2",
|
|
7512
7523
|
"quote",
|
|
@@ -7517,9 +7528,9 @@ dependencies = [
|
|
|
7517
7528
|
|
|
7518
7529
|
[[package]]
|
|
7519
7530
|
name = "wasm-bindgen-shared"
|
|
7520
|
-
version = "0.2.
|
|
7531
|
+
version = "0.2.103"
|
|
7521
7532
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7522
|
-
checksum = "
|
|
7533
|
+
checksum = "293c37f4efa430ca14db3721dfbe48d8c33308096bd44d80ebaa775ab71ba1cf"
|
|
7523
7534
|
dependencies = [
|
|
7524
7535
|
"unicode-ident",
|
|
7525
7536
|
]
|
|
@@ -7539,9 +7550,9 @@ dependencies = [
|
|
|
7539
7550
|
|
|
7540
7551
|
[[package]]
|
|
7541
7552
|
name = "web-sys"
|
|
7542
|
-
version = "0.3.
|
|
7553
|
+
version = "0.3.80"
|
|
7543
7554
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7544
|
-
checksum = "
|
|
7555
|
+
checksum = "fbe734895e869dc429d78c4b433f8d17d95f8d05317440b4fad5ab2d33e596dc"
|
|
7545
7556
|
dependencies = [
|
|
7546
7557
|
"js-sys",
|
|
7547
7558
|
"wasm-bindgen",
|
|
@@ -14,6 +14,7 @@ struct Tokenizer {
|
|
|
14
14
|
cross_token: Option<u32>,
|
|
15
15
|
plus_token: Option<u32>,
|
|
16
16
|
block_size: usize,
|
|
17
|
+
assistant_suffix_num_tokens: usize,
|
|
17
18
|
}
|
|
18
19
|
|
|
19
20
|
impl Tokenizer {
|
|
@@ -33,8 +34,14 @@ impl Tokenizer {
|
|
|
33
34
|
}
|
|
34
35
|
|
|
35
36
|
fn assistanttok(&self, m: &str, tokens: &mut Vec<u32>) -> tokenizers::tokenizer::Result<()> {
|
|
37
|
+
let encoding = self.tok.encode_fast(self.assistant(m), false)?;
|
|
38
|
+
let extra = encoding.get_ids();
|
|
39
|
+
|
|
36
40
|
self.extend_crop(
|
|
37
|
-
|
|
41
|
+
// TODO: for now, we always drop any assistant suffix. we
|
|
42
|
+
// will need to figure out how to isolate these on their
|
|
43
|
+
// own block
|
|
44
|
+
&extra[0..extra.len() - self.assistant_suffix_num_tokens],
|
|
38
45
|
tokens,
|
|
39
46
|
);
|
|
40
47
|
Ok(())
|
|
@@ -75,7 +82,7 @@ impl Tokenizer {
|
|
|
75
82
|
// `tokens.len()+self.block_size-1`.
|
|
76
83
|
let end = extra.len() + tokens.len();
|
|
77
84
|
let nearest_block_boundary = end / self.block_size * self.block_size;
|
|
78
|
-
let amount_to_crop = end - nearest_block_boundary;
|
|
85
|
+
let amount_to_crop = ::std::cmp::min(extra.len(), end - nearest_block_boundary);
|
|
79
86
|
let extra_end = extra.len() - amount_to_crop;
|
|
80
87
|
|
|
81
88
|
self.extend(&extra[0..extra_end], tokens);
|
|
@@ -106,13 +113,44 @@ impl TokenizerState {
|
|
|
106
113
|
block_size: usize,
|
|
107
114
|
) -> Result<::std::sync::Arc<Tokenizer>, ::std::sync::Arc<tokenizers::tokenizer::Error>> {
|
|
108
115
|
self.cache.try_get_with(model.clone(), || {
|
|
116
|
+
let tok = tokenizers::tokenizer::Tokenizer::from_pretrained(model, None)?;
|
|
117
|
+
let tmpl = chat_template::detect(model)?;
|
|
118
|
+
|
|
119
|
+
let m = "hello";
|
|
120
|
+
let binding = tok.encode_fast(
|
|
121
|
+
chat_template::apply(tmpl, &[Message::Assistant(m.to_owned())], false),
|
|
122
|
+
false,
|
|
123
|
+
)?;
|
|
124
|
+
let binding2 = tok.encode_fast(m, false)?;
|
|
125
|
+
let with_chat_template = binding.get_ids();
|
|
126
|
+
let without_chat_template = binding2.get_ids();
|
|
127
|
+
|
|
128
|
+
// TODO this is imperfect...
|
|
129
|
+
let start_of_message_idx = with_chat_template
|
|
130
|
+
.iter()
|
|
131
|
+
.position(|t| *t == without_chat_template[0]);
|
|
132
|
+
let end_of_message_idx = start_of_message_idx
|
|
133
|
+
.map(|start_of_message_idx| start_of_message_idx + without_chat_template.len());
|
|
134
|
+
// [pppppmmmmmmmmmss] <- ppppp are the prefix special tokens added by chat template; ss are the suffix special tokens
|
|
135
|
+
// ^ start_of_message_idx
|
|
136
|
+
// ^ end_of_message_idx
|
|
137
|
+
let assistant_suffix_num_tokens = if let Some(end_of_message_idx) = end_of_message_idx {
|
|
138
|
+
with_chat_template.len() - end_of_message_idx
|
|
139
|
+
} else {
|
|
140
|
+
eprintln!(
|
|
141
|
+
"Warning: could not determine length of end of assistant special token sequence"
|
|
142
|
+
);
|
|
143
|
+
0
|
|
144
|
+
};
|
|
145
|
+
|
|
109
146
|
Ok(::std::sync::Arc::new(Tokenizer {
|
|
110
|
-
tmpl
|
|
111
|
-
tok
|
|
147
|
+
tmpl,
|
|
148
|
+
tok,
|
|
112
149
|
pad_token,
|
|
113
150
|
cross_token,
|
|
114
151
|
plus_token,
|
|
115
152
|
block_size,
|
|
153
|
+
assistant_suffix_num_tokens,
|
|
116
154
|
}))
|
|
117
155
|
})
|
|
118
156
|
}
|
|
@@ -382,15 +420,9 @@ pub fn tokenize_query(
|
|
|
382
420
|
temperature,
|
|
383
421
|
} = query.g;
|
|
384
422
|
|
|
385
|
-
let s = ::std::time::Instant::now();
|
|
386
423
|
let tok = state
|
|
387
424
|
.get_or_create(&model, pad_token, cross_token, plus_token, block_size)
|
|
388
425
|
.map_err(handle_arc_err)?;
|
|
389
|
-
println!(
|
|
390
|
-
"Spnl tokenize_query from pretrained {model}. Loaded in {:?}",
|
|
391
|
-
s.elapsed()
|
|
392
|
-
);
|
|
393
|
-
|
|
394
426
|
let mut tokens: Vec<u32> = vec![];
|
|
395
427
|
tokenize_part(&input, &tok, &mut tokens)
|
|
396
428
|
.and_then(|()| add_final_assistant_token(&tok, &mut tokens))
|
|
@@ -460,3 +492,94 @@ pub fn tokenize_prepare(
|
|
|
460
492
|
_ => todo!(),
|
|
461
493
|
}
|
|
462
494
|
}
|
|
495
|
+
|
|
496
|
+
#[cfg(test)]
|
|
497
|
+
mod tests {
|
|
498
|
+
use super::*;
|
|
499
|
+
use itertools::Itertools;
|
|
500
|
+
|
|
501
|
+
const PAD_TOKEN: u32 = 27;
|
|
502
|
+
const BLOCK_SIZE: usize = 16;
|
|
503
|
+
|
|
504
|
+
const MODEL: &str = "ibm-granite/granite-3.3-2b-instruct"; // TODO find smaller model with public tokenizers.json
|
|
505
|
+
const START_OF_ROLE: u32 = 49152;
|
|
506
|
+
const END_OF_ROLE: u32 = 49153;
|
|
507
|
+
const END_OF_TEXT: u32 = 0;
|
|
508
|
+
const USER: u32 = 496;
|
|
509
|
+
const ASSISTANT: u32 = 17594;
|
|
510
|
+
const HELLO: u32 = 7656;
|
|
511
|
+
const LONGER: u32 = 8928;
|
|
512
|
+
|
|
513
|
+
fn tok() -> Result<::std::sync::Arc<Tokenizer>, ::std::sync::Arc<tokenizers::tokenizer::Error>>
|
|
514
|
+
{
|
|
515
|
+
init(2).get_or_create(&MODEL.into(), PAD_TOKEN, None, None, BLOCK_SIZE)
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
#[test]
|
|
519
|
+
fn create_tokenizer() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
|
|
520
|
+
tok().map(|_| ())
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
#[test]
|
|
524
|
+
fn user() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
|
|
525
|
+
assert_eq!(
|
|
526
|
+
tok().map(|tok| tok.user("hello"))?,
|
|
527
|
+
"<|start_of_role|>user<|end_of_role|>hello<|end_of_text|>"
|
|
528
|
+
);
|
|
529
|
+
Ok(())
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
#[test]
|
|
533
|
+
fn usertok() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
|
|
534
|
+
let mut tokens = vec![];
|
|
535
|
+
tok()?.usertok("hello", &mut tokens)?;
|
|
536
|
+
assert_eq!(
|
|
537
|
+
tokens,
|
|
538
|
+
[START_OF_ROLE, USER, END_OF_ROLE, HELLO, END_OF_TEXT]
|
|
539
|
+
);
|
|
540
|
+
Ok(())
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
#[test]
|
|
544
|
+
fn assistant() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
|
|
545
|
+
assert_eq!(
|
|
546
|
+
tok().map(|tok| tok.assistant("hello"))?,
|
|
547
|
+
"<|start_of_role|>assistant<|end_of_role|>hello<|end_of_text|>"
|
|
548
|
+
);
|
|
549
|
+
Ok(())
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
#[test]
|
|
553
|
+
fn assistanttok_fully_cropped() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
|
|
554
|
+
let mut tokens = vec![];
|
|
555
|
+
tok()?.assistanttok("hello", &mut tokens)?;
|
|
556
|
+
let empty: &[u32] = &[];
|
|
557
|
+
assert_eq!(tokens, empty);
|
|
558
|
+
Ok(())
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
#[test]
|
|
562
|
+
fn assistanttok_partially_cropped() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>>
|
|
563
|
+
{
|
|
564
|
+
let repeat_input = 17; // repeat this many times for the input message
|
|
565
|
+
let repeat_output = 12; // expect this many repetitions after cropping
|
|
566
|
+
let mut tokens = vec![];
|
|
567
|
+
tok()?.assistanttok(
|
|
568
|
+
format!(
|
|
569
|
+
"hello {}",
|
|
570
|
+
::std::iter::repeat_n("longer", repeat_input).join(" ")
|
|
571
|
+
)
|
|
572
|
+
.as_str(),
|
|
573
|
+
&mut tokens,
|
|
574
|
+
)?;
|
|
575
|
+
assert_eq!(
|
|
576
|
+
tokens,
|
|
577
|
+
[START_OF_ROLE, ASSISTANT, END_OF_ROLE, HELLO]
|
|
578
|
+
.into_iter()
|
|
579
|
+
.chain(::std::iter::repeat_n(LONGER, repeat_output))
|
|
580
|
+
// .chain([END_OF_TEXT])
|
|
581
|
+
.collect::<Vec<u32>>(),
|
|
582
|
+
);
|
|
583
|
+
Ok(())
|
|
584
|
+
}
|
|
585
|
+
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|