tiktoken_ruby 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +13 -11
- data/Gemfile.lock +39 -39
- data/ext/tiktoken_ruby/Cargo.toml +3 -3
- data/ext/tiktoken_ruby/src/core_bpe_wrapper.rs +17 -14
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +4 -1
- data/script/release +28 -18
- metadata +10 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9997f4334fdaff90a631036be451cad90eb58ce6919a9592de2ca09d7f8baf9e
|
4
|
+
data.tar.gz: 160c540bf8a76278ebcabfe80c50e9fdb5802a0742d4692295e659ba8626b492
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea271891e7ca2fbfb637da4945c5f6da55f72b8c39efe20c4f83d5d059d2b9997f4344feadbd6109a29859729e4bba245fe51338e77c548e9e9a24bd1981a6e9
|
7
|
+
data.tar.gz: 3b6eeda82acaa6b6abb324e911196bce53d20a2bbec7750e07cb3b9d7a8381ed9ebd90b68a4e770e8bb1b4be332748216bc653d31389591348bc5192656e417f
|
data/Cargo.lock
CHANGED
@@ -115,12 +115,13 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
|
115
115
|
|
116
116
|
[[package]]
|
117
117
|
name = "fancy-regex"
|
118
|
-
version = "0.
|
118
|
+
version = "0.13.0"
|
119
119
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
120
|
-
checksum = "
|
120
|
+
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
|
121
121
|
dependencies = [
|
122
122
|
"bit-set",
|
123
|
-
"regex",
|
123
|
+
"regex-automata",
|
124
|
+
"regex-syntax",
|
124
125
|
]
|
125
126
|
|
126
127
|
[[package]]
|
@@ -178,9 +179,9 @@ dependencies = [
|
|
178
179
|
|
179
180
|
[[package]]
|
180
181
|
name = "magnus"
|
181
|
-
version = "0.
|
182
|
+
version = "0.7.1"
|
182
183
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
183
|
-
checksum = "
|
184
|
+
checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
|
184
185
|
dependencies = [
|
185
186
|
"magnus-macros",
|
186
187
|
"rb-sys",
|
@@ -264,18 +265,18 @@ dependencies = [
|
|
264
265
|
|
265
266
|
[[package]]
|
266
267
|
name = "rb-sys"
|
267
|
-
version = "0.9.
|
268
|
+
version = "0.9.106"
|
268
269
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
269
|
-
checksum = "
|
270
|
+
checksum = "17b6efdbc8c1a22cb8b5d7ead0237c16c362c9ef6fbdc09e2d1040615b0f4cd0"
|
270
271
|
dependencies = [
|
271
272
|
"rb-sys-build",
|
272
273
|
]
|
273
274
|
|
274
275
|
[[package]]
|
275
276
|
name = "rb-sys-build"
|
276
|
-
version = "0.9.
|
277
|
+
version = "0.9.106"
|
277
278
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
278
|
-
checksum = "
|
279
|
+
checksum = "e1d88c51e52f8636a5efc24ec5987056e64e48a91ed2a1af96cb5564686cc10f"
|
279
280
|
dependencies = [
|
280
281
|
"bindgen",
|
281
282
|
"lazy_static",
|
@@ -399,9 +400,9 @@ dependencies = [
|
|
399
400
|
|
400
401
|
[[package]]
|
401
402
|
name = "tiktoken-rs"
|
402
|
-
version = "0.
|
403
|
+
version = "0.6.0"
|
403
404
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
404
|
-
checksum = "
|
405
|
+
checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
|
405
406
|
dependencies = [
|
406
407
|
"anyhow",
|
407
408
|
"base64",
|
@@ -409,6 +410,7 @@ dependencies = [
|
|
409
410
|
"fancy-regex",
|
410
411
|
"lazy_static",
|
411
412
|
"parking_lot",
|
413
|
+
"regex",
|
412
414
|
"rustc-hash",
|
413
415
|
]
|
414
416
|
|
data/Gemfile.lock
CHANGED
@@ -1,73 +1,73 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tiktoken_ruby (0.0.
|
5
|
-
rb_sys (
|
4
|
+
tiktoken_ruby (0.0.11)
|
5
|
+
rb_sys (= 0.9.106)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
10
|
ast (2.4.2)
|
11
|
-
diff-lcs (1.5.
|
12
|
-
json (2.
|
11
|
+
diff-lcs (1.5.1)
|
12
|
+
json (2.9.1)
|
13
13
|
language_server-protocol (3.17.0.3)
|
14
14
|
lint_roller (1.1.0)
|
15
15
|
minitest (5.21.2)
|
16
|
-
parallel (1.
|
17
|
-
parser (3.3.0
|
16
|
+
parallel (1.26.3)
|
17
|
+
parser (3.3.6.0)
|
18
18
|
ast (~> 2.4.1)
|
19
19
|
racc
|
20
|
-
racc (1.
|
20
|
+
racc (1.8.1)
|
21
21
|
rainbow (3.1.1)
|
22
|
-
rake (13.1
|
23
|
-
rake-compiler (1.2.
|
22
|
+
rake (13.2.1)
|
23
|
+
rake-compiler (1.2.9)
|
24
24
|
rake
|
25
|
-
rb_sys (0.9.
|
26
|
-
regexp_parser (2.
|
27
|
-
|
28
|
-
|
29
|
-
rspec-
|
30
|
-
rspec-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
rspec-expectations (3.12.3)
|
25
|
+
rb_sys (0.9.106)
|
26
|
+
regexp_parser (2.10.0)
|
27
|
+
rspec (3.13.0)
|
28
|
+
rspec-core (~> 3.13.0)
|
29
|
+
rspec-expectations (~> 3.13.0)
|
30
|
+
rspec-mocks (~> 3.13.0)
|
31
|
+
rspec-core (3.13.2)
|
32
|
+
rspec-support (~> 3.13.0)
|
33
|
+
rspec-expectations (3.13.3)
|
35
34
|
diff-lcs (>= 1.2.0, < 2.0)
|
36
|
-
rspec-support (~> 3.
|
37
|
-
rspec-mocks (3.
|
35
|
+
rspec-support (~> 3.13.0)
|
36
|
+
rspec-mocks (3.13.2)
|
38
37
|
diff-lcs (>= 1.2.0, < 2.0)
|
39
|
-
rspec-support (~> 3.
|
40
|
-
rspec-support (3.
|
41
|
-
rubocop (1.
|
38
|
+
rspec-support (~> 3.13.0)
|
39
|
+
rspec-support (3.13.2)
|
40
|
+
rubocop (1.69.2)
|
42
41
|
json (~> 2.3)
|
43
42
|
language_server-protocol (>= 3.17.0)
|
44
43
|
parallel (~> 1.10)
|
45
|
-
parser (>= 3.
|
44
|
+
parser (>= 3.3.0.2)
|
46
45
|
rainbow (>= 2.2.2, < 4.0)
|
47
|
-
regexp_parser (>=
|
48
|
-
|
49
|
-
rubocop-ast (>= 1.30.0, < 2.0)
|
46
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
47
|
+
rubocop-ast (>= 1.36.2, < 2.0)
|
50
48
|
ruby-progressbar (~> 1.7)
|
51
|
-
unicode-display_width (>= 2.4.0, <
|
52
|
-
rubocop-ast (1.
|
53
|
-
parser (>= 3.
|
54
|
-
rubocop-performance (1.
|
49
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
50
|
+
rubocop-ast (1.37.0)
|
51
|
+
parser (>= 3.3.1.0)
|
52
|
+
rubocop-performance (1.23.0)
|
55
53
|
rubocop (>= 1.48.1, < 2.0)
|
56
|
-
rubocop-ast (>= 1.
|
54
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
57
55
|
ruby-progressbar (1.13.0)
|
58
|
-
standard (1.
|
56
|
+
standard (1.43.0)
|
59
57
|
language_server-protocol (~> 3.17.0.2)
|
60
58
|
lint_roller (~> 1.0)
|
61
|
-
rubocop (~> 1.
|
59
|
+
rubocop (~> 1.69.1)
|
62
60
|
standard-custom (~> 1.0.0)
|
63
|
-
standard-performance (~> 1.
|
61
|
+
standard-performance (~> 1.6)
|
64
62
|
standard-custom (1.0.2)
|
65
63
|
lint_roller (~> 1.0)
|
66
64
|
rubocop (~> 1.50)
|
67
|
-
standard-performance (1.
|
65
|
+
standard-performance (1.6.0)
|
68
66
|
lint_roller (~> 1.1)
|
69
|
-
rubocop-performance (~> 1.
|
70
|
-
unicode-display_width (
|
67
|
+
rubocop-performance (~> 1.23.0)
|
68
|
+
unicode-display_width (3.1.3)
|
69
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
70
|
+
unicode-emoji (4.0.4)
|
71
71
|
yard (0.9.34)
|
72
72
|
yard-doctest (0.1.17)
|
73
73
|
minitest
|
data/ext/tiktoken_ruby/Cargo.toml
CHANGED
@@ -10,6 +10,6 @@ publish = false
|
|
10
10
|
crate-type = ["cdylib"]
|
11
11
|
|
12
12
|
[dependencies]
|
13
|
-
magnus = { version = "0.
|
14
|
-
rb-sys = { version = "0.9.
|
15
|
-
tiktoken-rs = { version = "0.
|
13
|
+
magnus = { version = "0.7.1" }
|
14
|
+
rb-sys = { version = "0.9.106", features = ["stable-api-compiled-fallback"] }
|
15
|
+
tiktoken-rs = { version = "0.6.0" }
|
data/ext/tiktoken_ruby/src/core_bpe_wrapper.rs
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
use std::collections::HashSet;
|
2
2
|
|
3
|
-
use
|
3
|
+
use tiktoken_rs::Rank;
|
4
4
|
|
5
|
+
use crate::uncicode_error;
|
5
6
|
|
6
7
|
#[magnus::wrap(class = "Tiktoken::Ext::CoreBPE")]
|
7
8
|
pub struct CoreBPEWrapper {
|
@@ -13,11 +14,15 @@ impl CoreBPEWrapper {
|
|
13
14
|
Self { core_bpe }
|
14
15
|
}
|
15
16
|
|
16
|
-
pub fn encode_ordinary(&self, text: String) -> Vec<
|
17
|
+
pub fn encode_ordinary(&self, text: String) -> Vec<Rank> {
|
17
18
|
self.core_bpe.encode_ordinary(text.as_str())
|
18
19
|
}
|
19
20
|
|
20
|
-
pub fn encode(
|
21
|
+
pub fn encode(
|
22
|
+
&self,
|
23
|
+
text: String,
|
24
|
+
allowed_special: magnus::RArray,
|
25
|
+
) -> Result<Vec<Rank>, magnus::Error> {
|
21
26
|
let allowed_special: Vec<String> = allowed_special.to_vec()?;
|
22
27
|
let allowed_special: Vec<&str> = allowed_special.iter().map(|s| s.as_str()).collect();
|
23
28
|
let allowed_special: HashSet<&str> = HashSet::from_iter(allowed_special.iter().cloned());
|
@@ -25,20 +30,18 @@ impl CoreBPEWrapper {
|
|
25
30
|
Ok(self.core_bpe.encode(text.as_str(), allowed_special))
|
26
31
|
}
|
27
32
|
|
28
|
-
pub fn encode_with_special_tokens(&self, text: String) -> Vec<
|
33
|
+
pub fn encode_with_special_tokens(&self, text: String) -> Vec<Rank> {
|
29
34
|
self.core_bpe.encode_with_special_tokens(text.as_str())
|
30
35
|
}
|
31
36
|
|
32
|
-
pub fn decode(&self, ids: Vec<
|
33
|
-
self.core_bpe.decode(ids)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
};
|
39
|
-
|
40
|
-
magnus::Error::new(error, e.to_string())
|
41
|
-
})
|
37
|
+
pub fn decode(&self, ids: Vec<Rank>) -> Result<String, magnus::Error> {
|
38
|
+
self.core_bpe.decode(ids).map_err(|e| {
|
39
|
+
let error = match uncicode_error() {
|
40
|
+
Ok(error) => error,
|
41
|
+
Err(e) => return e,
|
42
|
+
};
|
42
43
|
|
44
|
+
magnus::Error::new(error, e.to_string())
|
45
|
+
})
|
43
46
|
}
|
44
47
|
}
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -69,9 +69,12 @@ module Tiktoken
|
|
69
69
|
]
|
70
70
|
|
71
71
|
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
72
|
-
# that is also MIT licensed but by OpenAI
|
72
|
+
# that is also MIT licensed but by OpenAI;
|
73
|
+
# https://github.com/Congyuwang/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs#L50
|
74
|
+
# is the source of the mapping for the Rust library
|
73
75
|
MODEL_TO_ENCODING_NAME = {
|
74
76
|
# chat
|
77
|
+
"chatgpt-4o-latest": "o200k_base",
|
75
78
|
"gpt-4o": "o200k_base",
|
76
79
|
"gpt-4": "cl100k_base",
|
77
80
|
"gpt-3.5-turbo": "cl100k_base",
|
data/script/release
CHANGED
@@ -7,25 +7,35 @@ if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then
|
|
7
7
|
exit 1
|
8
8
|
fi
|
9
9
|
|
10
|
+
run_id=""
|
11
|
+
# Parse arguments
|
12
|
+
while [[ "$#" -gt 0 ]]; do
|
13
|
+
case $1 in
|
14
|
+
--run-id)
|
15
|
+
run_id="$2"
|
16
|
+
shift 2
|
17
|
+
;;
|
18
|
+
*)
|
19
|
+
echo "Unknown parameter passed: $1"
|
20
|
+
exit 1
|
21
|
+
;;
|
22
|
+
esac
|
23
|
+
done
|
24
|
+
|
25
|
+
if [ -z "${run_id}" ]; then
|
26
|
+
echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow."
|
27
|
+
exit 1
|
28
|
+
fi
|
29
|
+
|
10
30
|
version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2)
|
11
|
-
echo "Building tiktoken_ruby v$version"
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
"
|
18
|
-
"
|
19
|
-
"arm-linux"
|
20
|
-
"x64-mingw-ucrt"
|
21
|
-
)
|
22
|
-
|
23
|
-
# for target in "${targets[@]}"; do
|
24
|
-
# bundle exec rb-sys-dock -p "$target" --ruby-versions 3.2 --build
|
25
|
-
# done
|
26
|
-
|
27
|
-
for gem in pkg/tiktoken_ruby-"$version"-*.gem ; do
|
28
|
-
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
|
31
|
+
echo "Building tiktoken_ruby v$version, using artifacts from run $run_id"
|
32
|
+
|
33
|
+
rm -rf pkg/cross-compiled
|
34
|
+
gh run download "$run_id" -D pkg/cross-compiled
|
35
|
+
|
36
|
+
for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do
|
37
|
+
echo "Publishing $gem"
|
38
|
+
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
|
29
39
|
done
|
30
40
|
|
31
41
|
# last but not least, the uncompiled gem
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-01-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - '='
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.9.
|
19
|
+
version: 0.9.106
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - '='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.9.
|
26
|
+
version: 0.9.106
|
27
27
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
28
28
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
29
29
|
it to OpenAI APIs.
|
@@ -61,7 +61,7 @@ metadata:
|
|
61
61
|
homepage_uri: https://github.com/IAPark/tiktoken_ruby
|
62
62
|
source_code_uri: https://github.com/IAPark/tiktoken_ruby
|
63
63
|
documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
|
64
|
-
post_install_message:
|
64
|
+
post_install_message:
|
65
65
|
rdoc_options: []
|
66
66
|
require_paths:
|
67
67
|
- lib
|
@@ -76,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: 3.4.0
|
78
78
|
requirements: []
|
79
|
-
rubygems_version: 3.5.
|
80
|
-
signing_key:
|
79
|
+
rubygems_version: 3.5.22
|
80
|
+
signing_key:
|
81
81
|
specification_version: 4
|
82
82
|
summary: Ruby wrapper for Tiktoken
|
83
83
|
test_files: []
|