tiktoken_ruby 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +13 -11
- data/Gemfile.lock +39 -39
- data/ext/tiktoken_ruby/Cargo.toml +3 -3
- data/ext/tiktoken_ruby/src/core_bpe_wrapper.rs +17 -14
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +4 -1
- data/script/release +28 -18
- metadata +10 -10
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9997f4334fdaff90a631036be451cad90eb58ce6919a9592de2ca09d7f8baf9e
|
|
4
|
+
data.tar.gz: 160c540bf8a76278ebcabfe80c50e9fdb5802a0742d4692295e659ba8626b492
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ea271891e7ca2fbfb637da4945c5f6da55f72b8c39efe20c4f83d5d059d2b9997f4344feadbd6109a29859729e4bba245fe51338e77c548e9e9a24bd1981a6e9
|
|
7
|
+
data.tar.gz: 3b6eeda82acaa6b6abb324e911196bce53d20a2bbec7750e07cb3b9d7a8381ed9ebd90b68a4e770e8bb1b4be332748216bc653d31389591348bc5192656e417f
|
data/Cargo.lock
CHANGED
|
@@ -115,12 +115,13 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
|
|
115
115
|
|
|
116
116
|
[[package]]
|
|
117
117
|
name = "fancy-regex"
|
|
118
|
-
version = "0.
|
|
118
|
+
version = "0.13.0"
|
|
119
119
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
120
|
-
checksum = "
|
|
120
|
+
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
|
|
121
121
|
dependencies = [
|
|
122
122
|
"bit-set",
|
|
123
|
-
"regex",
|
|
123
|
+
"regex-automata",
|
|
124
|
+
"regex-syntax",
|
|
124
125
|
]
|
|
125
126
|
|
|
126
127
|
[[package]]
|
|
@@ -178,9 +179,9 @@ dependencies = [
|
|
|
178
179
|
|
|
179
180
|
[[package]]
|
|
180
181
|
name = "magnus"
|
|
181
|
-
version = "0.
|
|
182
|
+
version = "0.7.1"
|
|
182
183
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
183
|
-
checksum = "
|
|
184
|
+
checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
|
|
184
185
|
dependencies = [
|
|
185
186
|
"magnus-macros",
|
|
186
187
|
"rb-sys",
|
|
@@ -264,18 +265,18 @@ dependencies = [
|
|
|
264
265
|
|
|
265
266
|
[[package]]
|
|
266
267
|
name = "rb-sys"
|
|
267
|
-
version = "0.9.
|
|
268
|
+
version = "0.9.106"
|
|
268
269
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
269
|
-
checksum = "
|
|
270
|
+
checksum = "17b6efdbc8c1a22cb8b5d7ead0237c16c362c9ef6fbdc09e2d1040615b0f4cd0"
|
|
270
271
|
dependencies = [
|
|
271
272
|
"rb-sys-build",
|
|
272
273
|
]
|
|
273
274
|
|
|
274
275
|
[[package]]
|
|
275
276
|
name = "rb-sys-build"
|
|
276
|
-
version = "0.9.
|
|
277
|
+
version = "0.9.106"
|
|
277
278
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
278
|
-
checksum = "
|
|
279
|
+
checksum = "e1d88c51e52f8636a5efc24ec5987056e64e48a91ed2a1af96cb5564686cc10f"
|
|
279
280
|
dependencies = [
|
|
280
281
|
"bindgen",
|
|
281
282
|
"lazy_static",
|
|
@@ -399,9 +400,9 @@ dependencies = [
|
|
|
399
400
|
|
|
400
401
|
[[package]]
|
|
401
402
|
name = "tiktoken-rs"
|
|
402
|
-
version = "0.
|
|
403
|
+
version = "0.6.0"
|
|
403
404
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
404
|
-
checksum = "
|
|
405
|
+
checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
|
|
405
406
|
dependencies = [
|
|
406
407
|
"anyhow",
|
|
407
408
|
"base64",
|
|
@@ -409,6 +410,7 @@ dependencies = [
|
|
|
409
410
|
"fancy-regex",
|
|
410
411
|
"lazy_static",
|
|
411
412
|
"parking_lot",
|
|
413
|
+
"regex",
|
|
412
414
|
"rustc-hash",
|
|
413
415
|
]
|
|
414
416
|
|
data/Gemfile.lock
CHANGED
|
@@ -1,73 +1,73 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
tiktoken_ruby (0.0.
|
|
5
|
-
rb_sys (
|
|
4
|
+
tiktoken_ruby (0.0.11)
|
|
5
|
+
rb_sys (= 0.9.106)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
10
|
ast (2.4.2)
|
|
11
|
-
diff-lcs (1.5.
|
|
12
|
-
json (2.
|
|
11
|
+
diff-lcs (1.5.1)
|
|
12
|
+
json (2.9.1)
|
|
13
13
|
language_server-protocol (3.17.0.3)
|
|
14
14
|
lint_roller (1.1.0)
|
|
15
15
|
minitest (5.21.2)
|
|
16
|
-
parallel (1.
|
|
17
|
-
parser (3.3.0
|
|
16
|
+
parallel (1.26.3)
|
|
17
|
+
parser (3.3.6.0)
|
|
18
18
|
ast (~> 2.4.1)
|
|
19
19
|
racc
|
|
20
|
-
racc (1.
|
|
20
|
+
racc (1.8.1)
|
|
21
21
|
rainbow (3.1.1)
|
|
22
|
-
rake (13.1
|
|
23
|
-
rake-compiler (1.2.
|
|
22
|
+
rake (13.2.1)
|
|
23
|
+
rake-compiler (1.2.9)
|
|
24
24
|
rake
|
|
25
|
-
rb_sys (0.9.
|
|
26
|
-
regexp_parser (2.
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
rspec-
|
|
30
|
-
rspec-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
rspec-expectations (3.12.3)
|
|
25
|
+
rb_sys (0.9.106)
|
|
26
|
+
regexp_parser (2.10.0)
|
|
27
|
+
rspec (3.13.0)
|
|
28
|
+
rspec-core (~> 3.13.0)
|
|
29
|
+
rspec-expectations (~> 3.13.0)
|
|
30
|
+
rspec-mocks (~> 3.13.0)
|
|
31
|
+
rspec-core (3.13.2)
|
|
32
|
+
rspec-support (~> 3.13.0)
|
|
33
|
+
rspec-expectations (3.13.3)
|
|
35
34
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
36
|
-
rspec-support (~> 3.
|
|
37
|
-
rspec-mocks (3.
|
|
35
|
+
rspec-support (~> 3.13.0)
|
|
36
|
+
rspec-mocks (3.13.2)
|
|
38
37
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
39
|
-
rspec-support (~> 3.
|
|
40
|
-
rspec-support (3.
|
|
41
|
-
rubocop (1.
|
|
38
|
+
rspec-support (~> 3.13.0)
|
|
39
|
+
rspec-support (3.13.2)
|
|
40
|
+
rubocop (1.69.2)
|
|
42
41
|
json (~> 2.3)
|
|
43
42
|
language_server-protocol (>= 3.17.0)
|
|
44
43
|
parallel (~> 1.10)
|
|
45
|
-
parser (>= 3.
|
|
44
|
+
parser (>= 3.3.0.2)
|
|
46
45
|
rainbow (>= 2.2.2, < 4.0)
|
|
47
|
-
regexp_parser (>=
|
|
48
|
-
|
|
49
|
-
rubocop-ast (>= 1.30.0, < 2.0)
|
|
46
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
|
47
|
+
rubocop-ast (>= 1.36.2, < 2.0)
|
|
50
48
|
ruby-progressbar (~> 1.7)
|
|
51
|
-
unicode-display_width (>= 2.4.0, <
|
|
52
|
-
rubocop-ast (1.
|
|
53
|
-
parser (>= 3.
|
|
54
|
-
rubocop-performance (1.
|
|
49
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
|
50
|
+
rubocop-ast (1.37.0)
|
|
51
|
+
parser (>= 3.3.1.0)
|
|
52
|
+
rubocop-performance (1.23.0)
|
|
55
53
|
rubocop (>= 1.48.1, < 2.0)
|
|
56
|
-
rubocop-ast (>= 1.
|
|
54
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
|
57
55
|
ruby-progressbar (1.13.0)
|
|
58
|
-
standard (1.
|
|
56
|
+
standard (1.43.0)
|
|
59
57
|
language_server-protocol (~> 3.17.0.2)
|
|
60
58
|
lint_roller (~> 1.0)
|
|
61
|
-
rubocop (~> 1.
|
|
59
|
+
rubocop (~> 1.69.1)
|
|
62
60
|
standard-custom (~> 1.0.0)
|
|
63
|
-
standard-performance (~> 1.
|
|
61
|
+
standard-performance (~> 1.6)
|
|
64
62
|
standard-custom (1.0.2)
|
|
65
63
|
lint_roller (~> 1.0)
|
|
66
64
|
rubocop (~> 1.50)
|
|
67
|
-
standard-performance (1.
|
|
65
|
+
standard-performance (1.6.0)
|
|
68
66
|
lint_roller (~> 1.1)
|
|
69
|
-
rubocop-performance (~> 1.
|
|
70
|
-
unicode-display_width (
|
|
67
|
+
rubocop-performance (~> 1.23.0)
|
|
68
|
+
unicode-display_width (3.1.3)
|
|
69
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
|
70
|
+
unicode-emoji (4.0.4)
|
|
71
71
|
yard (0.9.34)
|
|
72
72
|
yard-doctest (0.1.17)
|
|
73
73
|
minitest
|
data/ext/tiktoken_ruby/Cargo.toml
CHANGED
|
@@ -10,6 +10,6 @@ publish = false
|
|
|
10
10
|
crate-type = ["cdylib"]
|
|
11
11
|
|
|
12
12
|
[dependencies]
|
|
13
|
-
magnus = { version = "0.
|
|
14
|
-
rb-sys = { version = "0.9.
|
|
15
|
-
tiktoken-rs = { version = "0.
|
|
13
|
+
magnus = { version = "0.7.1" }
|
|
14
|
+
rb-sys = { version = "0.9.106", features = ["stable-api-compiled-fallback"] }
|
|
15
|
+
tiktoken-rs = { version = "0.6.0" }
|
data/ext/tiktoken_ruby/src/core_bpe_wrapper.rs
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
use std::collections::HashSet;
|
|
2
2
|
|
|
3
|
-
use
|
|
3
|
+
use tiktoken_rs::Rank;
|
|
4
4
|
|
|
5
|
+
use crate::uncicode_error;
|
|
5
6
|
|
|
6
7
|
#[magnus::wrap(class = "Tiktoken::Ext::CoreBPE")]
|
|
7
8
|
pub struct CoreBPEWrapper {
|
|
@@ -13,11 +14,15 @@ impl CoreBPEWrapper {
|
|
|
13
14
|
Self { core_bpe }
|
|
14
15
|
}
|
|
15
16
|
|
|
16
|
-
pub fn encode_ordinary(&self, text: String) -> Vec<
|
|
17
|
+
pub fn encode_ordinary(&self, text: String) -> Vec<Rank> {
|
|
17
18
|
self.core_bpe.encode_ordinary(text.as_str())
|
|
18
19
|
}
|
|
19
20
|
|
|
20
|
-
pub fn encode(
|
|
21
|
+
pub fn encode(
|
|
22
|
+
&self,
|
|
23
|
+
text: String,
|
|
24
|
+
allowed_special: magnus::RArray,
|
|
25
|
+
) -> Result<Vec<Rank>, magnus::Error> {
|
|
21
26
|
let allowed_special: Vec<String> = allowed_special.to_vec()?;
|
|
22
27
|
let allowed_special: Vec<&str> = allowed_special.iter().map(|s| s.as_str()).collect();
|
|
23
28
|
let allowed_special: HashSet<&str> = HashSet::from_iter(allowed_special.iter().cloned());
|
|
@@ -25,20 +30,18 @@ impl CoreBPEWrapper {
|
|
|
25
30
|
Ok(self.core_bpe.encode(text.as_str(), allowed_special))
|
|
26
31
|
}
|
|
27
32
|
|
|
28
|
-
pub fn encode_with_special_tokens(&self, text: String) -> Vec<
|
|
33
|
+
pub fn encode_with_special_tokens(&self, text: String) -> Vec<Rank> {
|
|
29
34
|
self.core_bpe.encode_with_special_tokens(text.as_str())
|
|
30
35
|
}
|
|
31
36
|
|
|
32
|
-
pub fn decode(&self, ids: Vec<
|
|
33
|
-
self.core_bpe.decode(ids)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
};
|
|
39
|
-
|
|
40
|
-
magnus::Error::new(error, e.to_string())
|
|
41
|
-
})
|
|
37
|
+
pub fn decode(&self, ids: Vec<Rank>) -> Result<String, magnus::Error> {
|
|
38
|
+
self.core_bpe.decode(ids).map_err(|e| {
|
|
39
|
+
let error = match uncicode_error() {
|
|
40
|
+
Ok(error) => error,
|
|
41
|
+
Err(e) => return e,
|
|
42
|
+
};
|
|
42
43
|
|
|
44
|
+
magnus::Error::new(error, e.to_string())
|
|
45
|
+
})
|
|
43
46
|
}
|
|
44
47
|
}
|
data/lib/tiktoken_ruby.rb
CHANGED
|
@@ -69,9 +69,12 @@ module Tiktoken
|
|
|
69
69
|
]
|
|
70
70
|
|
|
71
71
|
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
|
72
|
-
# that is also MIT licensed but by OpenAI
|
|
72
|
+
# that is also MIT licensed but by OpenAI;
|
|
73
|
+
# https://github.com/Congyuwang/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs#L50
|
|
74
|
+
# is the source of the mapping for the Rust library
|
|
73
75
|
MODEL_TO_ENCODING_NAME = {
|
|
74
76
|
# chat
|
|
77
|
+
"chatgpt-4o-latest": "o200k_base",
|
|
75
78
|
"gpt-4o": "o200k_base",
|
|
76
79
|
"gpt-4": "cl100k_base",
|
|
77
80
|
"gpt-3.5-turbo": "cl100k_base",
|
data/script/release
CHANGED
|
@@ -7,25 +7,35 @@ if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then
|
|
|
7
7
|
exit 1
|
|
8
8
|
fi
|
|
9
9
|
|
|
10
|
+
run_id=""
|
|
11
|
+
# Parse arguments
|
|
12
|
+
while [[ "$#" -gt 0 ]]; do
|
|
13
|
+
case $1 in
|
|
14
|
+
--run-id)
|
|
15
|
+
run_id="$2"
|
|
16
|
+
shift 2
|
|
17
|
+
;;
|
|
18
|
+
*)
|
|
19
|
+
echo "Unknown parameter passed: $1"
|
|
20
|
+
exit 1
|
|
21
|
+
;;
|
|
22
|
+
esac
|
|
23
|
+
done
|
|
24
|
+
|
|
25
|
+
if [ -z "${run_id}" ]; then
|
|
26
|
+
echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow."
|
|
27
|
+
exit 1
|
|
28
|
+
fi
|
|
29
|
+
|
|
10
30
|
version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2)
|
|
11
|
-
echo "Building tiktoken_ruby v$version"
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"arm-linux"
|
|
20
|
-
"x64-mingw-ucrt"
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
# for target in "${targets[@]}"; do
|
|
24
|
-
# bundle exec rb-sys-dock -p "$target" --ruby-versions 3.2 --build
|
|
25
|
-
# done
|
|
26
|
-
|
|
27
|
-
for gem in pkg/tiktoken_ruby-"$version"-*.gem ; do
|
|
28
|
-
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
|
|
31
|
+
echo "Building tiktoken_ruby v$version, using artifacts from run $run_id"
|
|
32
|
+
|
|
33
|
+
rm -rf pkg/cross-compiled
|
|
34
|
+
gh run download "$run_id" -D pkg/cross-compiled
|
|
35
|
+
|
|
36
|
+
for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do
|
|
37
|
+
echo "Publishing $gem"
|
|
38
|
+
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
|
|
29
39
|
done
|
|
30
40
|
|
|
31
41
|
# last but not least, the uncompiled gem
|
metadata
CHANGED
|
@@ -1,29 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tiktoken_ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.11
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- IAPark
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2025-01-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- -
|
|
17
|
+
- - '='
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 0.9.
|
|
19
|
+
version: 0.9.106
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- -
|
|
24
|
+
- - '='
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 0.9.
|
|
26
|
+
version: 0.9.106
|
|
27
27
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
|
28
28
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
|
29
29
|
it to OpenAI APIs.
|
|
@@ -61,7 +61,7 @@ metadata:
|
|
|
61
61
|
homepage_uri: https://github.com/IAPark/tiktoken_ruby
|
|
62
62
|
source_code_uri: https://github.com/IAPark/tiktoken_ruby
|
|
63
63
|
documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
|
|
64
|
-
post_install_message:
|
|
64
|
+
post_install_message:
|
|
65
65
|
rdoc_options: []
|
|
66
66
|
require_paths:
|
|
67
67
|
- lib
|
|
@@ -76,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
76
76
|
- !ruby/object:Gem::Version
|
|
77
77
|
version: 3.4.0
|
|
78
78
|
requirements: []
|
|
79
|
-
rubygems_version: 3.5.
|
|
80
|
-
signing_key:
|
|
79
|
+
rubygems_version: 3.5.22
|
|
80
|
+
signing_key:
|
|
81
81
|
specification_version: 4
|
|
82
82
|
summary: Ruby wrapper for Tiktoken
|
|
83
83
|
test_files: []
|