tiktoken_ruby 0.0.13 → 0.0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vscode/settings.json +3 -0
- data/CHANGELOG.md +21 -0
- data/Gemfile.lock +9 -9
- data/README.md +33 -0
- data/ext/tiktoken_ruby/src/core_bpe_wrapper.rs +121 -7
- data/lib/tiktoken_ruby/encoding.rb +7 -0
- data/lib/tiktoken_ruby/version.rb +1 -1
- metadata +3 -3
- data/script/release +0 -43
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 71cce652b2f6a2ca962823d1947603a5224f305901cb4d8c822ca32b58192d47
|
|
4
|
+
data.tar.gz: d2cd0525f5f784a5904e1b7bc05cb3dc0c00f1f10561d52a99c6ef12b1351e89
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fe7572bf7a82f77441335a273e90e6b4bce92be2b6fb6073c1409de635af8c3521b0181ba8d3422691f8d5242b6a3b08ec899fad465084bb7fc570d153e44b00
|
|
7
|
+
data.tar.gz: 4f95fa2c39ed53c1d40cb6928f294f1617b35042abd4b8cdcf641dac016b4dba9a15b3f46835cd85600af8a73152c5f3488694db9cdf9d3dd22707ab109a133d
|
data/.vscode/settings.json
CHANGED
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# [v0.0.14.1] - 20-12-2025
|
|
2
|
+
## What's Changed
|
|
3
|
+
* Cut v0.0.12 by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/78
|
|
4
|
+
* Bump magnus from 0.8.0 to 0.8.1 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/83
|
|
5
|
+
* Bump actions/checkout from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/82
|
|
6
|
+
* Bump standard from 1.50.0 to 1.51.1 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/81
|
|
7
|
+
* Bump actions/upload-artifact from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/84
|
|
8
|
+
* Bump magnus from 0.8.1 to 0.8.2 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/85
|
|
9
|
+
* Bump the bundler-dependencies group with 2 updates by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/86
|
|
10
|
+
* Support by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/87
|
|
11
|
+
* Bump actions/checkout from 5 to 6 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/89
|
|
12
|
+
* Bump standard from 1.51.1 to 1.52.0 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/88
|
|
13
|
+
* release GVL while encoding / decoding tokens by @tenderworks in https://github.com/IAPark/tiktoken_ruby/pull/90
|
|
14
|
+
* Drop Ruby 3.1 support; automate release process by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/92
|
|
15
|
+
* Rewrite history by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/93
|
|
16
|
+
* Force workflow rebuild by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/94
|
|
17
|
+
|
|
18
|
+
## New Contributors
|
|
19
|
+
* @tenderworks made their first contribution in https://github.com/IAPark/tiktoken_ruby/pull/90
|
|
20
|
+
|
|
21
|
+
**Full Changelog**: https://github.com/IAPark/tiktoken_ruby/compare/v0.0.12...v0.0.14.1
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
tiktoken_ruby (0.0.13)
|
|
4
|
+
tiktoken_ruby (0.0.14.1)
|
|
5
5
|
rb_sys (~> 0.9)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -9,15 +9,15 @@ GEM
|
|
|
9
9
|
specs:
|
|
10
10
|
ast (2.4.3)
|
|
11
11
|
diff-lcs (1.6.2)
|
|
12
|
-
json (2.
|
|
12
|
+
json (2.16.0)
|
|
13
13
|
language_server-protocol (3.17.0.5)
|
|
14
14
|
lint_roller (1.1.0)
|
|
15
15
|
minitest (5.25.5)
|
|
16
16
|
parallel (1.27.0)
|
|
17
|
-
parser (3.3.
|
|
17
|
+
parser (3.3.10.0)
|
|
18
18
|
ast (~> 2.4.1)
|
|
19
19
|
racc
|
|
20
|
-
prism (1.
|
|
20
|
+
prism (1.6.0)
|
|
21
21
|
racc (1.8.1)
|
|
22
22
|
rainbow (3.1.1)
|
|
23
23
|
rake (13.3.1)
|
|
@@ -40,7 +40,7 @@ GEM
|
|
|
40
40
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
41
41
|
rspec-support (~> 3.13.0)
|
|
42
42
|
rspec-support (3.13.6)
|
|
43
|
-
rubocop (1.
|
|
43
|
+
rubocop (1.81.7)
|
|
44
44
|
json (~> 2.3)
|
|
45
45
|
language_server-protocol (~> 3.17.0.2)
|
|
46
46
|
lint_roller (~> 1.1.0)
|
|
@@ -48,10 +48,10 @@ GEM
|
|
|
48
48
|
parser (>= 3.3.0.2)
|
|
49
49
|
rainbow (>= 2.2.2, < 4.0)
|
|
50
50
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
51
|
-
rubocop-ast (>= 1.
|
|
51
|
+
rubocop-ast (>= 1.47.1, < 2.0)
|
|
52
52
|
ruby-progressbar (~> 1.7)
|
|
53
53
|
unicode-display_width (>= 2.4.0, < 4.0)
|
|
54
|
-
rubocop-ast (1.
|
|
54
|
+
rubocop-ast (1.48.0)
|
|
55
55
|
parser (>= 3.3.7.2)
|
|
56
56
|
prism (~> 1.4)
|
|
57
57
|
rubocop-performance (1.25.0)
|
|
@@ -59,10 +59,10 @@ GEM
|
|
|
59
59
|
rubocop (>= 1.75.0, < 2.0)
|
|
60
60
|
rubocop-ast (>= 1.38.0, < 2.0)
|
|
61
61
|
ruby-progressbar (1.13.0)
|
|
62
|
-
standard (1.
|
|
62
|
+
standard (1.52.0)
|
|
63
63
|
language_server-protocol (~> 3.17.0.2)
|
|
64
64
|
lint_roller (~> 1.0)
|
|
65
|
-
rubocop (~> 1.
|
|
65
|
+
rubocop (~> 1.81.7)
|
|
66
66
|
standard-custom (~> 1.0.0)
|
|
67
67
|
standard-performance (~> 1.8)
|
|
68
68
|
standard-custom (1.0.2)
|
data/README.md
CHANGED
|
@@ -36,6 +36,39 @@ enc = Tiktoken.encoding_for_model("gpt-4")
|
|
|
36
36
|
enc.encode("hello world").length #=> 2
|
|
37
37
|
```
|
|
38
38
|
|
|
39
|
+
### Encoding methods
|
|
40
|
+
|
|
41
|
+
There are three methods for encoding text:
|
|
42
|
+
|
|
43
|
+
- `encode_ordinary(text)` - Encodes text, always treating special tokens as ordinary text
|
|
44
|
+
- `encode(text, allowed_special: [])` - Encodes text, treating special tokens as text unless listed in `allowed_special`
|
|
45
|
+
- `encode_with_special_tokens(text)` - Encodes text, recognizing and parsing all special tokens
|
|
46
|
+
|
|
47
|
+
**Special tokens** are control sequences used by OpenAI models, such as `<|endoftext|>`, `<|fim_prefix|>`, `<|fim_middle|>`, and `<|fim_suffix|>`. The encoding methods differ in how they handle these sequences:
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
enc = Tiktoken.get_encoding("cl100k_base")
|
|
51
|
+
text = "Hello<|endoftext|>World"
|
|
52
|
+
|
|
53
|
+
# encode_ordinary: treats <|endoftext|> as literal characters (9 tokens)
|
|
54
|
+
enc.encode_ordinary(text)
|
|
55
|
+
#=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
|
|
56
|
+
|
|
57
|
+
# encode: same as encode_ordinary by default
|
|
58
|
+
enc.encode(text)
|
|
59
|
+
#=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
|
|
60
|
+
|
|
61
|
+
# encode with allowed_special: recognizes the specified special token (3 tokens)
|
|
62
|
+
enc.encode(text, allowed_special: ["<|endoftext|>"])
|
|
63
|
+
#=> [9906, 100257, 10343]
|
|
64
|
+
|
|
65
|
+
# encode_with_special_tokens: recognizes ALL special tokens (3 tokens)
|
|
66
|
+
enc.encode_with_special_tokens(text)
|
|
67
|
+
#=> [9906, 100257, 10343]
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
All methods round-trip correctly through `decode`.
|
|
71
|
+
|
|
39
72
|
## Development
|
|
40
73
|
|
|
41
74
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
use std::collections::HashSet;
|
|
2
|
+
use std::ffi::c_void;
|
|
2
3
|
|
|
3
4
|
use tiktoken_rs::Rank;
|
|
4
5
|
|
|
@@ -9,13 +10,82 @@ pub struct CoreBPEWrapper {
|
|
|
9
10
|
core_bpe: tiktoken_rs::CoreBPE,
|
|
10
11
|
}
|
|
11
12
|
|
|
13
|
+
struct EncodeOrdinaryData {
|
|
14
|
+
core_bpe: *const tiktoken_rs::CoreBPE,
|
|
15
|
+
text: String,
|
|
16
|
+
result: Vec<Rank>,
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
struct EncodeData {
|
|
20
|
+
core_bpe: *const tiktoken_rs::CoreBPE,
|
|
21
|
+
text: String,
|
|
22
|
+
allowed_special: HashSet<String>,
|
|
23
|
+
result: Vec<Rank>,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
struct EncodeSpecialData {
|
|
27
|
+
core_bpe: *const tiktoken_rs::CoreBPE,
|
|
28
|
+
text: String,
|
|
29
|
+
result: Vec<Rank>,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
struct DecodeData {
|
|
33
|
+
core_bpe: *const tiktoken_rs::CoreBPE,
|
|
34
|
+
ids: Vec<Rank>,
|
|
35
|
+
result: Result<String, String>,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
unsafe extern "C" fn encode_ordinary_without_gvl(data: *mut c_void) -> *mut c_void {
|
|
39
|
+
let data = &mut *(data as *mut EncodeOrdinaryData);
|
|
40
|
+
let core_bpe = &*data.core_bpe;
|
|
41
|
+
data.result = core_bpe.encode_ordinary(&data.text);
|
|
42
|
+
std::ptr::null_mut()
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
unsafe extern "C" fn encode_without_gvl(data: *mut c_void) -> *mut c_void {
|
|
46
|
+
let data = &mut *(data as *mut EncodeData);
|
|
47
|
+
let core_bpe = &*data.core_bpe;
|
|
48
|
+
let allowed_special: HashSet<&str> = data.allowed_special.iter().map(|s| s.as_str()).collect();
|
|
49
|
+
data.result = core_bpe.encode(&data.text, &allowed_special).0;
|
|
50
|
+
std::ptr::null_mut()
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
unsafe extern "C" fn encode_special_without_gvl(data: *mut c_void) -> *mut c_void {
|
|
54
|
+
let data = &mut *(data as *mut EncodeSpecialData);
|
|
55
|
+
let core_bpe = &*data.core_bpe;
|
|
56
|
+
data.result = core_bpe.encode_with_special_tokens(&data.text);
|
|
57
|
+
std::ptr::null_mut()
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
unsafe extern "C" fn decode_without_gvl(data: *mut c_void) -> *mut c_void {
|
|
61
|
+
let data = &mut *(data as *mut DecodeData);
|
|
62
|
+
let core_bpe = &*data.core_bpe;
|
|
63
|
+
data.result = core_bpe.decode(std::mem::take(&mut data.ids)).map_err(|e| e.to_string());
|
|
64
|
+
std::ptr::null_mut()
|
|
65
|
+
}
|
|
66
|
+
|
|
12
67
|
impl CoreBPEWrapper {
|
|
13
68
|
pub fn new(core_bpe: tiktoken_rs::CoreBPE) -> Self {
|
|
14
69
|
Self { core_bpe }
|
|
15
70
|
}
|
|
16
71
|
|
|
17
72
|
pub fn encode_ordinary(&self, text: String) -> Vec<Rank> {
|
|
18
|
-
|
|
73
|
+
let mut data = EncodeOrdinaryData {
|
|
74
|
+
core_bpe: &self.core_bpe as *const _,
|
|
75
|
+
text,
|
|
76
|
+
result: Vec::new(),
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
unsafe {
|
|
80
|
+
rb_sys::rb_thread_call_without_gvl(
|
|
81
|
+
Some(encode_ordinary_without_gvl),
|
|
82
|
+
&mut data as *mut _ as *mut c_void,
|
|
83
|
+
None,
|
|
84
|
+
std::ptr::null_mut(),
|
|
85
|
+
);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
data.result
|
|
19
89
|
}
|
|
20
90
|
|
|
21
91
|
pub fn encode(
|
|
@@ -24,24 +94,68 @@ impl CoreBPEWrapper {
|
|
|
24
94
|
allowed_special: magnus::RArray,
|
|
25
95
|
) -> Result<Vec<Rank>, magnus::Error> {
|
|
26
96
|
let allowed_special: Vec<String> = allowed_special.to_vec()?;
|
|
27
|
-
let allowed_special: Vec<&str> = allowed_special.iter().map(|s| s.as_str()).collect();
|
|
28
|
-
let allowed_special: HashSet<&str> = HashSet::from_iter(allowed_special.iter().cloned());
|
|
29
97
|
|
|
30
|
-
|
|
98
|
+
let mut data = EncodeData {
|
|
99
|
+
core_bpe: &self.core_bpe as *const _,
|
|
100
|
+
text,
|
|
101
|
+
allowed_special: HashSet::from_iter(allowed_special),
|
|
102
|
+
result: Vec::new(),
|
|
103
|
+
};
|
|
104
|
+
|
|
105
|
+
unsafe {
|
|
106
|
+
rb_sys::rb_thread_call_without_gvl(
|
|
107
|
+
Some(encode_without_gvl),
|
|
108
|
+
&mut data as *mut _ as *mut c_void,
|
|
109
|
+
None,
|
|
110
|
+
std::ptr::null_mut(),
|
|
111
|
+
);
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
Ok(data.result)
|
|
31
115
|
}
|
|
32
116
|
|
|
33
117
|
pub fn encode_with_special_tokens(&self, text: String) -> Vec<Rank> {
|
|
34
|
-
|
|
118
|
+
let mut data = EncodeSpecialData {
|
|
119
|
+
core_bpe: &self.core_bpe as *const _,
|
|
120
|
+
text,
|
|
121
|
+
result: Vec::new(),
|
|
122
|
+
};
|
|
123
|
+
|
|
124
|
+
unsafe {
|
|
125
|
+
rb_sys::rb_thread_call_without_gvl(
|
|
126
|
+
Some(encode_special_without_gvl),
|
|
127
|
+
&mut data as *mut _ as *mut c_void,
|
|
128
|
+
None,
|
|
129
|
+
std::ptr::null_mut(),
|
|
130
|
+
);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
data.result
|
|
35
134
|
}
|
|
36
135
|
|
|
37
136
|
pub fn decode(&self, ids: Vec<Rank>) -> Result<String, magnus::Error> {
|
|
38
|
-
|
|
137
|
+
let mut data = DecodeData {
|
|
138
|
+
core_bpe: &self.core_bpe as *const _,
|
|
139
|
+
ids,
|
|
140
|
+
result: Err(String::new()),
|
|
141
|
+
};
|
|
142
|
+
|
|
143
|
+
unsafe {
|
|
144
|
+
rb_sys::rb_thread_call_without_gvl(
|
|
145
|
+
Some(decode_without_gvl),
|
|
146
|
+
&mut data as *mut _ as *mut c_void,
|
|
147
|
+
None,
|
|
148
|
+
std::ptr::null_mut(),
|
|
149
|
+
);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
data.result.map_err(|e| {
|
|
39
153
|
let error = match uncicode_error() {
|
|
40
154
|
Ok(error) => error,
|
|
41
155
|
Err(e) => return e,
|
|
42
156
|
};
|
|
43
157
|
|
|
44
|
-
magnus::Error::new(error, e
|
|
158
|
+
magnus::Error::new(error, e)
|
|
45
159
|
})
|
|
46
160
|
}
|
|
47
161
|
}
|
|
@@ -40,6 +40,13 @@ class Tiktoken::Encoding
|
|
|
40
40
|
@ext_base_bpe.encode(text, allowed_special)
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
# Encodes the text as a list of integer tokens, recognizing all special tokens in the text.
|
|
44
|
+
# @param text [String] The text to encode
|
|
45
|
+
# @return [Array<Integer>] The encoded tokens
|
|
46
|
+
def encode_with_special_tokens(text)
|
|
47
|
+
@ext_base_bpe.encode_with_special_tokens(text)
|
|
48
|
+
end
|
|
49
|
+
|
|
43
50
|
# Decodes the tokens back into text
|
|
44
51
|
# @param tokens [Array<Integer>] The tokens to decode
|
|
45
52
|
# @return [String] The decoded text
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tiktoken_ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.13
|
|
4
|
+
version: 0.0.14.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- IAPark
|
|
@@ -36,6 +36,7 @@ files:
|
|
|
36
36
|
- ".rspec"
|
|
37
37
|
- ".standard.yml"
|
|
38
38
|
- ".vscode/settings.json"
|
|
39
|
+
- CHANGELOG.md
|
|
39
40
|
- Cargo.lock
|
|
40
41
|
- Cargo.toml
|
|
41
42
|
- Gemfile
|
|
@@ -51,7 +52,6 @@ files:
|
|
|
51
52
|
- lib/tiktoken_ruby.rb
|
|
52
53
|
- lib/tiktoken_ruby/encoding.rb
|
|
53
54
|
- lib/tiktoken_ruby/version.rb
|
|
54
|
-
- script/release
|
|
55
55
|
- sig/tiktoken_ruby.rbs
|
|
56
56
|
homepage: https://github.com/IAPark/tiktoken_ruby
|
|
57
57
|
licenses:
|
|
@@ -67,7 +67,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
67
67
|
requirements:
|
|
68
68
|
- - ">="
|
|
69
69
|
- !ruby/object:Gem::Version
|
|
70
|
-
version: 3.
|
|
70
|
+
version: 3.2.0
|
|
71
71
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
72
72
|
requirements:
|
|
73
73
|
- - ">="
|
data/script/release
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
|
|
3
|
-
set -e
|
|
4
|
-
|
|
5
|
-
if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then
|
|
6
|
-
echo "Error: TIKTOKEN_PUBLISH_KEY is not set. This is the RubyGems API key to push the gem."
|
|
7
|
-
exit 1
|
|
8
|
-
fi
|
|
9
|
-
|
|
10
|
-
run_id=""
|
|
11
|
-
# Parse arguments
|
|
12
|
-
while [[ "$#" -gt 0 ]]; do
|
|
13
|
-
case $1 in
|
|
14
|
-
--run-id)
|
|
15
|
-
run_id="$2"
|
|
16
|
-
shift 2
|
|
17
|
-
;;
|
|
18
|
-
*)
|
|
19
|
-
echo "Unknown parameter passed: $1"
|
|
20
|
-
exit 1
|
|
21
|
-
;;
|
|
22
|
-
esac
|
|
23
|
-
done
|
|
24
|
-
|
|
25
|
-
if [ -z "${run_id}" ]; then
|
|
26
|
-
echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow."
|
|
27
|
-
exit 1
|
|
28
|
-
fi
|
|
29
|
-
|
|
30
|
-
version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2)
|
|
31
|
-
echo "Building tiktoken_ruby v$version, using artifacts from run $run_id"
|
|
32
|
-
|
|
33
|
-
rm -rf pkg/cross-compiled
|
|
34
|
-
gh run download "$run_id" -D pkg/cross-compiled
|
|
35
|
-
|
|
36
|
-
for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do
|
|
37
|
-
echo "Publishing $gem"
|
|
38
|
-
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
|
|
39
|
-
done
|
|
40
|
-
|
|
41
|
-
# last but not least, the uncompiled gem
|
|
42
|
-
bundle exec rake package
|
|
43
|
-
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "pkg/tiktoken_ruby-$version.gem" --host https://rubygems.org
|