tiktoken_ruby 0.0.10 → 0.0.11.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: df7ca54ce72c32b1a0a0b278553f35365bf283af271f98436b4d330f6fb05735
4
- data.tar.gz: 5a225b39a85ce71779bb7e58c6f6c47eb8a884b813377783d833b27af53cbb9e
3
+ metadata.gz: 923d3291c75ea3e0d93b45c9a50e11323b421c3e6c93f140f8c5b64e708bc203
4
+ data.tar.gz: 2a1f843387a971b4b9735e52abf58471381bc592895af99b1b4091723b246266
5
5
  SHA512:
6
- metadata.gz: d15ec5fee05da4104e7111e237e5bf5e80291d226c51afd90fa580ebed87571140db4b06a17e39a82892489a6a932cb02d594a9cbf047315c5094bb4f25bd99b
7
- data.tar.gz: 444bd7e59d054395689a3f45beb3c288262e1e9e1903dc565821780ad9929de6aabbdbd9f543506b3cc662a54835892e9b34b9ece259d6f3f27d5642a8f1b75a
6
+ metadata.gz: 98aa2b547a129a5377a838ddfae4df66aee8de87fea916787283dbd2227066d2f99808aa0b71a522e27b58dd59b214c000cc3b933ebf9b9ff2aad56c6cb43536
7
+ data.tar.gz: 3beb0aead95f22b024cbe4e7d105698a1c0dbacc247956d7c91dcf0e468103a577a4a0bb6bf8cb2402516eb9ad521acb3d3af1090f626ab59380eb886e6c06ec
data/Cargo.lock CHANGED
@@ -115,12 +115,13 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
115
115
 
116
116
  [[package]]
117
117
  name = "fancy-regex"
118
- version = "0.12.0"
118
+ version = "0.13.0"
119
119
  source = "registry+https://github.com/rust-lang/crates.io-index"
120
- checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05"
120
+ checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
121
121
  dependencies = [
122
122
  "bit-set",
123
- "regex",
123
+ "regex-automata",
124
+ "regex-syntax",
124
125
  ]
125
126
 
126
127
  [[package]]
@@ -178,9 +179,9 @@ dependencies = [
178
179
 
179
180
  [[package]]
180
181
  name = "magnus"
181
- version = "0.6.4"
182
+ version = "0.7.1"
182
183
  source = "registry+https://github.com/rust-lang/crates.io-index"
183
- checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
184
+ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
184
185
  dependencies = [
185
186
  "magnus-macros",
186
187
  "rb-sys",
@@ -264,18 +265,18 @@ dependencies = [
264
265
 
265
266
  [[package]]
266
267
  name = "rb-sys"
267
- version = "0.9.105"
268
+ version = "0.9.106"
268
269
  source = "registry+https://github.com/rust-lang/crates.io-index"
269
- checksum = "4b3a1f3ce8e7c36d777d52fe7a99039fe4fea7c8ec355a4c4f3a17f92a14029f"
270
+ checksum = "17b6efdbc8c1a22cb8b5d7ead0237c16c362c9ef6fbdc09e2d1040615b0f4cd0"
270
271
  dependencies = [
271
272
  "rb-sys-build",
272
273
  ]
273
274
 
274
275
  [[package]]
275
276
  name = "rb-sys-build"
276
- version = "0.9.105"
277
+ version = "0.9.106"
277
278
  source = "registry+https://github.com/rust-lang/crates.io-index"
278
- checksum = "3e6b246c29c0809e1cbe60a1ba9e093da72a4676d02adc68469297d1e589bbf0"
279
+ checksum = "e1d88c51e52f8636a5efc24ec5987056e64e48a91ed2a1af96cb5564686cc10f"
279
280
  dependencies = [
280
281
  "bindgen",
281
282
  "lazy_static",
@@ -399,9 +400,9 @@ dependencies = [
399
400
 
400
401
  [[package]]
401
402
  name = "tiktoken-rs"
402
- version = "0.5.9"
403
+ version = "0.6.0"
403
404
  source = "registry+https://github.com/rust-lang/crates.io-index"
404
- checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234"
405
+ checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
405
406
  dependencies = [
406
407
  "anyhow",
407
408
  "base64",
@@ -409,6 +410,7 @@ dependencies = [
409
410
  "fancy-regex",
410
411
  "lazy_static",
411
412
  "parking_lot",
413
+ "regex",
412
414
  "rustc-hash",
413
415
  ]
414
416
 
data/Gemfile.lock CHANGED
@@ -1,73 +1,73 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.10)
5
- rb_sys (>= 0.9.87)
4
+ tiktoken_ruby (0.0.11.1)
5
+ rb_sys (= 0.9.106)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
10
  ast (2.4.2)
11
- diff-lcs (1.5.0)
12
- json (2.7.1)
11
+ diff-lcs (1.5.1)
12
+ json (2.9.1)
13
13
  language_server-protocol (3.17.0.3)
14
14
  lint_roller (1.1.0)
15
15
  minitest (5.21.2)
16
- parallel (1.24.0)
17
- parser (3.3.0.4)
16
+ parallel (1.26.3)
17
+ parser (3.3.6.0)
18
18
  ast (~> 2.4.1)
19
19
  racc
20
- racc (1.7.3)
20
+ racc (1.8.1)
21
21
  rainbow (3.1.1)
22
- rake (13.1.0)
23
- rake-compiler (1.2.5)
22
+ rake (13.2.1)
23
+ rake-compiler (1.2.9)
24
24
  rake
25
- rb_sys (0.9.105)
26
- regexp_parser (2.9.0)
27
- rexml (3.2.6)
28
- rspec (3.12.0)
29
- rspec-core (~> 3.12.0)
30
- rspec-expectations (~> 3.12.0)
31
- rspec-mocks (~> 3.12.0)
32
- rspec-core (3.12.2)
33
- rspec-support (~> 3.12.0)
34
- rspec-expectations (3.12.3)
25
+ rb_sys (0.9.106)
26
+ regexp_parser (2.10.0)
27
+ rspec (3.13.0)
28
+ rspec-core (~> 3.13.0)
29
+ rspec-expectations (~> 3.13.0)
30
+ rspec-mocks (~> 3.13.0)
31
+ rspec-core (3.13.2)
32
+ rspec-support (~> 3.13.0)
33
+ rspec-expectations (3.13.3)
35
34
  diff-lcs (>= 1.2.0, < 2.0)
36
- rspec-support (~> 3.12.0)
37
- rspec-mocks (3.12.6)
35
+ rspec-support (~> 3.13.0)
36
+ rspec-mocks (3.13.2)
38
37
  diff-lcs (>= 1.2.0, < 2.0)
39
- rspec-support (~> 3.12.0)
40
- rspec-support (3.12.1)
41
- rubocop (1.59.0)
38
+ rspec-support (~> 3.13.0)
39
+ rspec-support (3.13.2)
40
+ rubocop (1.69.2)
42
41
  json (~> 2.3)
43
42
  language_server-protocol (>= 3.17.0)
44
43
  parallel (~> 1.10)
45
- parser (>= 3.2.2.4)
44
+ parser (>= 3.3.0.2)
46
45
  rainbow (>= 2.2.2, < 4.0)
47
- regexp_parser (>= 1.8, < 3.0)
48
- rexml (>= 3.2.5, < 4.0)
49
- rubocop-ast (>= 1.30.0, < 2.0)
46
+ regexp_parser (>= 2.9.3, < 3.0)
47
+ rubocop-ast (>= 1.36.2, < 2.0)
50
48
  ruby-progressbar (~> 1.7)
51
- unicode-display_width (>= 2.4.0, < 3.0)
52
- rubocop-ast (1.30.0)
53
- parser (>= 3.2.1.0)
54
- rubocop-performance (1.20.2)
49
+ unicode-display_width (>= 2.4.0, < 4.0)
50
+ rubocop-ast (1.37.0)
51
+ parser (>= 3.3.1.0)
52
+ rubocop-performance (1.23.0)
55
53
  rubocop (>= 1.48.1, < 2.0)
56
- rubocop-ast (>= 1.30.0, < 2.0)
54
+ rubocop-ast (>= 1.31.1, < 2.0)
57
55
  ruby-progressbar (1.13.0)
58
- standard (1.33.0)
56
+ standard (1.43.0)
59
57
  language_server-protocol (~> 3.17.0.2)
60
58
  lint_roller (~> 1.0)
61
- rubocop (~> 1.59.0)
59
+ rubocop (~> 1.69.1)
62
60
  standard-custom (~> 1.0.0)
63
- standard-performance (~> 1.3)
61
+ standard-performance (~> 1.6)
64
62
  standard-custom (1.0.2)
65
63
  lint_roller (~> 1.0)
66
64
  rubocop (~> 1.50)
67
- standard-performance (1.3.1)
65
+ standard-performance (1.6.0)
68
66
  lint_roller (~> 1.1)
69
- rubocop-performance (~> 1.20.2)
70
- unicode-display_width (2.5.0)
67
+ rubocop-performance (~> 1.23.0)
68
+ unicode-display_width (3.1.3)
69
+ unicode-emoji (~> 4.0, >= 4.0.4)
70
+ unicode-emoji (4.0.4)
71
71
  yard (0.9.34)
72
72
  yard-doctest (0.1.17)
73
73
  minitest
@@ -10,6 +10,6 @@ publish = false
10
10
  crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
- magnus = { version = "0.6.1" }
14
- rb-sys = { version = "0.9.87", features = ["stable-api-compiled-fallback"] }
15
- tiktoken-rs = { version = "0.5.9" }
13
+ magnus = { version = "0.7.1" }
14
+ rb-sys = { version = "0.9.106", features = ["stable-api-compiled-fallback"] }
15
+ tiktoken-rs = { version = "0.6.0" }
@@ -1,7 +1,8 @@
1
1
  use std::collections::HashSet;
2
2
 
3
- use crate::uncicode_error;
3
+ use tiktoken_rs::Rank;
4
4
 
5
+ use crate::uncicode_error;
5
6
 
6
7
  #[magnus::wrap(class = "Tiktoken::Ext::CoreBPE")]
7
8
  pub struct CoreBPEWrapper {
@@ -13,11 +14,15 @@ impl CoreBPEWrapper {
13
14
  Self { core_bpe }
14
15
  }
15
16
 
16
- pub fn encode_ordinary(&self, text: String) -> Vec<usize> {
17
+ pub fn encode_ordinary(&self, text: String) -> Vec<Rank> {
17
18
  self.core_bpe.encode_ordinary(text.as_str())
18
19
  }
19
20
 
20
- pub fn encode(&self, text: String, allowed_special: magnus::RArray) -> Result<Vec<usize>, magnus::Error> {
21
+ pub fn encode(
22
+ &self,
23
+ text: String,
24
+ allowed_special: magnus::RArray,
25
+ ) -> Result<Vec<Rank>, magnus::Error> {
21
26
  let allowed_special: Vec<String> = allowed_special.to_vec()?;
22
27
  let allowed_special: Vec<&str> = allowed_special.iter().map(|s| s.as_str()).collect();
23
28
  let allowed_special: HashSet<&str> = HashSet::from_iter(allowed_special.iter().cloned());
@@ -25,20 +30,18 @@ impl CoreBPEWrapper {
25
30
  Ok(self.core_bpe.encode(text.as_str(), allowed_special))
26
31
  }
27
32
 
28
- pub fn encode_with_special_tokens(&self, text: String) -> Vec<usize> {
33
+ pub fn encode_with_special_tokens(&self, text: String) -> Vec<Rank> {
29
34
  self.core_bpe.encode_with_special_tokens(text.as_str())
30
35
  }
31
36
 
32
- pub fn decode(&self, ids: Vec<usize>) -> Result<String, magnus::Error> {
33
- self.core_bpe.decode(ids)
34
- .map_err(|e| {
35
- let error = match uncicode_error() {
36
- Ok(error) => error,
37
- Err(e) => return e
38
- };
39
-
40
- magnus::Error::new(error, e.to_string())
41
- })
37
+ pub fn decode(&self, ids: Vec<Rank>) -> Result<String, magnus::Error> {
38
+ self.core_bpe.decode(ids).map_err(|e| {
39
+ let error = match uncicode_error() {
40
+ Ok(error) => error,
41
+ Err(e) => return e,
42
+ };
42
43
 
44
+ magnus::Error::new(error, e.to_string())
45
+ })
43
46
  }
44
47
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.10"
4
+ VERSION = "0.0.11.1"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -69,9 +69,12 @@ module Tiktoken
69
69
  ]
70
70
 
71
71
  # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
72
- # that is also MIT licensed but by OpenAI
72
+ # that is also MIT licensed but by OpenAI;
73
+ # https://github.com/Congyuwang/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs#L50
74
+ # is the source of the mapping for the Rust library
73
75
  MODEL_TO_ENCODING_NAME = {
74
76
  # chat
77
+ "chatgpt-4o-latest": "o200k_base",
75
78
  "gpt-4o": "o200k_base",
76
79
  "gpt-4": "cl100k_base",
77
80
  "gpt-3.5-turbo": "cl100k_base",
data/script/release CHANGED
@@ -7,25 +7,35 @@ if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then
7
7
  exit 1
8
8
  fi
9
9
 
10
+ run_id=""
11
+ # Parse arguments
12
+ while [[ "$#" -gt 0 ]]; do
13
+ case $1 in
14
+ --run-id)
15
+ run_id="$2"
16
+ shift 2
17
+ ;;
18
+ *)
19
+ echo "Unknown parameter passed: $1"
20
+ exit 1
21
+ ;;
22
+ esac
23
+ done
24
+
25
+ if [ -z "${run_id}" ]; then
26
+ echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow."
27
+ exit 1
28
+ fi
29
+
10
30
  version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2)
11
- echo "Building tiktoken_ruby v$version"
12
-
13
- targets=(
14
- "arm64-darwin"
15
- "x86_64-darwin"
16
- "aarch64-linux"
17
- "x86_64-linux"
18
- "x86_64-linux-musl"
19
- "arm-linux"
20
- "x64-mingw-ucrt"
21
- )
22
-
23
- # for target in "${targets[@]}"; do
24
- # bundle exec rb-sys-dock -p "$target" --ruby-versions 3.2 --build
25
- # done
26
-
27
- for gem in pkg/tiktoken_ruby-"$version"-*.gem ; do
28
- GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
31
+ echo "Building tiktoken_ruby v$version, using artifacts from run $run_id"
32
+
33
+ rm -rf pkg/cross-compiled
34
+ gh run download "$run_id" -D pkg/cross-compiled
35
+
36
+ for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do
37
+ echo "Publishing $gem"
38
+ GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
29
39
  done
30
40
 
31
41
  # last but not least, the uncompiled gem
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-12-29 00:00:00.000000000 Z
11
+ date: 2025-01-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 0.9.87
19
+ version: 0.9.106
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 0.9.87
26
+ version: 0.9.106
27
27
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
28
28
  used by OpenAI. It can be used to count the number of tokens in text before sending
29
29
  it to OpenAI APIs.
@@ -61,7 +61,7 @@ metadata:
61
61
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
62
62
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
63
63
  documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
64
- post_install_message:
64
+ post_install_message:
65
65
  rdoc_options: []
66
66
  require_paths:
67
67
  - lib
@@ -76,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
76
76
  - !ruby/object:Gem::Version
77
77
  version: 3.4.0
78
78
  requirements: []
79
- rubygems_version: 3.5.23
80
- signing_key:
79
+ rubygems_version: 3.5.22
80
+ signing_key:
81
81
  specification_version: 4
82
82
  summary: Ruby wrapper for Tiktoken
83
83
  test_files: []