tiktoken_ruby 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: df7ca54ce72c32b1a0a0b278553f35365bf283af271f98436b4d330f6fb05735
4
- data.tar.gz: 5a225b39a85ce71779bb7e58c6f6c47eb8a884b813377783d833b27af53cbb9e
3
+ metadata.gz: 9997f4334fdaff90a631036be451cad90eb58ce6919a9592de2ca09d7f8baf9e
4
+ data.tar.gz: 160c540bf8a76278ebcabfe80c50e9fdb5802a0742d4692295e659ba8626b492
5
5
  SHA512:
6
- metadata.gz: d15ec5fee05da4104e7111e237e5bf5e80291d226c51afd90fa580ebed87571140db4b06a17e39a82892489a6a932cb02d594a9cbf047315c5094bb4f25bd99b
7
- data.tar.gz: 444bd7e59d054395689a3f45beb3c288262e1e9e1903dc565821780ad9929de6aabbdbd9f543506b3cc662a54835892e9b34b9ece259d6f3f27d5642a8f1b75a
6
+ metadata.gz: ea271891e7ca2fbfb637da4945c5f6da55f72b8c39efe20c4f83d5d059d2b9997f4344feadbd6109a29859729e4bba245fe51338e77c548e9e9a24bd1981a6e9
7
+ data.tar.gz: 3b6eeda82acaa6b6abb324e911196bce53d20a2bbec7750e07cb3b9d7a8381ed9ebd90b68a4e770e8bb1b4be332748216bc653d31389591348bc5192656e417f
data/Cargo.lock CHANGED
@@ -115,12 +115,13 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
115
115
 
116
116
  [[package]]
117
117
  name = "fancy-regex"
118
- version = "0.12.0"
118
+ version = "0.13.0"
119
119
  source = "registry+https://github.com/rust-lang/crates.io-index"
120
- checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05"
120
+ checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
121
121
  dependencies = [
122
122
  "bit-set",
123
- "regex",
123
+ "regex-automata",
124
+ "regex-syntax",
124
125
  ]
125
126
 
126
127
  [[package]]
@@ -178,9 +179,9 @@ dependencies = [
178
179
 
179
180
  [[package]]
180
181
  name = "magnus"
181
- version = "0.6.4"
182
+ version = "0.7.1"
182
183
  source = "registry+https://github.com/rust-lang/crates.io-index"
183
- checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
184
+ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
184
185
  dependencies = [
185
186
  "magnus-macros",
186
187
  "rb-sys",
@@ -264,18 +265,18 @@ dependencies = [
264
265
 
265
266
  [[package]]
266
267
  name = "rb-sys"
267
- version = "0.9.105"
268
+ version = "0.9.106"
268
269
  source = "registry+https://github.com/rust-lang/crates.io-index"
269
- checksum = "4b3a1f3ce8e7c36d777d52fe7a99039fe4fea7c8ec355a4c4f3a17f92a14029f"
270
+ checksum = "17b6efdbc8c1a22cb8b5d7ead0237c16c362c9ef6fbdc09e2d1040615b0f4cd0"
270
271
  dependencies = [
271
272
  "rb-sys-build",
272
273
  ]
273
274
 
274
275
  [[package]]
275
276
  name = "rb-sys-build"
276
- version = "0.9.105"
277
+ version = "0.9.106"
277
278
  source = "registry+https://github.com/rust-lang/crates.io-index"
278
- checksum = "3e6b246c29c0809e1cbe60a1ba9e093da72a4676d02adc68469297d1e589bbf0"
279
+ checksum = "e1d88c51e52f8636a5efc24ec5987056e64e48a91ed2a1af96cb5564686cc10f"
279
280
  dependencies = [
280
281
  "bindgen",
281
282
  "lazy_static",
@@ -399,9 +400,9 @@ dependencies = [
399
400
 
400
401
  [[package]]
401
402
  name = "tiktoken-rs"
402
- version = "0.5.9"
403
+ version = "0.6.0"
403
404
  source = "registry+https://github.com/rust-lang/crates.io-index"
404
- checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234"
405
+ checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
405
406
  dependencies = [
406
407
  "anyhow",
407
408
  "base64",
@@ -409,6 +410,7 @@ dependencies = [
409
410
  "fancy-regex",
410
411
  "lazy_static",
411
412
  "parking_lot",
413
+ "regex",
412
414
  "rustc-hash",
413
415
  ]
414
416
 
data/Gemfile.lock CHANGED
@@ -1,73 +1,73 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.10)
5
- rb_sys (>= 0.9.87)
4
+ tiktoken_ruby (0.0.11)
5
+ rb_sys (= 0.9.106)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
10
  ast (2.4.2)
11
- diff-lcs (1.5.0)
12
- json (2.7.1)
11
+ diff-lcs (1.5.1)
12
+ json (2.9.1)
13
13
  language_server-protocol (3.17.0.3)
14
14
  lint_roller (1.1.0)
15
15
  minitest (5.21.2)
16
- parallel (1.24.0)
17
- parser (3.3.0.4)
16
+ parallel (1.26.3)
17
+ parser (3.3.6.0)
18
18
  ast (~> 2.4.1)
19
19
  racc
20
- racc (1.7.3)
20
+ racc (1.8.1)
21
21
  rainbow (3.1.1)
22
- rake (13.1.0)
23
- rake-compiler (1.2.5)
22
+ rake (13.2.1)
23
+ rake-compiler (1.2.9)
24
24
  rake
25
- rb_sys (0.9.105)
26
- regexp_parser (2.9.0)
27
- rexml (3.2.6)
28
- rspec (3.12.0)
29
- rspec-core (~> 3.12.0)
30
- rspec-expectations (~> 3.12.0)
31
- rspec-mocks (~> 3.12.0)
32
- rspec-core (3.12.2)
33
- rspec-support (~> 3.12.0)
34
- rspec-expectations (3.12.3)
25
+ rb_sys (0.9.106)
26
+ regexp_parser (2.10.0)
27
+ rspec (3.13.0)
28
+ rspec-core (~> 3.13.0)
29
+ rspec-expectations (~> 3.13.0)
30
+ rspec-mocks (~> 3.13.0)
31
+ rspec-core (3.13.2)
32
+ rspec-support (~> 3.13.0)
33
+ rspec-expectations (3.13.3)
35
34
  diff-lcs (>= 1.2.0, < 2.0)
36
- rspec-support (~> 3.12.0)
37
- rspec-mocks (3.12.6)
35
+ rspec-support (~> 3.13.0)
36
+ rspec-mocks (3.13.2)
38
37
  diff-lcs (>= 1.2.0, < 2.0)
39
- rspec-support (~> 3.12.0)
40
- rspec-support (3.12.1)
41
- rubocop (1.59.0)
38
+ rspec-support (~> 3.13.0)
39
+ rspec-support (3.13.2)
40
+ rubocop (1.69.2)
42
41
  json (~> 2.3)
43
42
  language_server-protocol (>= 3.17.0)
44
43
  parallel (~> 1.10)
45
- parser (>= 3.2.2.4)
44
+ parser (>= 3.3.0.2)
46
45
  rainbow (>= 2.2.2, < 4.0)
47
- regexp_parser (>= 1.8, < 3.0)
48
- rexml (>= 3.2.5, < 4.0)
49
- rubocop-ast (>= 1.30.0, < 2.0)
46
+ regexp_parser (>= 2.9.3, < 3.0)
47
+ rubocop-ast (>= 1.36.2, < 2.0)
50
48
  ruby-progressbar (~> 1.7)
51
- unicode-display_width (>= 2.4.0, < 3.0)
52
- rubocop-ast (1.30.0)
53
- parser (>= 3.2.1.0)
54
- rubocop-performance (1.20.2)
49
+ unicode-display_width (>= 2.4.0, < 4.0)
50
+ rubocop-ast (1.37.0)
51
+ parser (>= 3.3.1.0)
52
+ rubocop-performance (1.23.0)
55
53
  rubocop (>= 1.48.1, < 2.0)
56
- rubocop-ast (>= 1.30.0, < 2.0)
54
+ rubocop-ast (>= 1.31.1, < 2.0)
57
55
  ruby-progressbar (1.13.0)
58
- standard (1.33.0)
56
+ standard (1.43.0)
59
57
  language_server-protocol (~> 3.17.0.2)
60
58
  lint_roller (~> 1.0)
61
- rubocop (~> 1.59.0)
59
+ rubocop (~> 1.69.1)
62
60
  standard-custom (~> 1.0.0)
63
- standard-performance (~> 1.3)
61
+ standard-performance (~> 1.6)
64
62
  standard-custom (1.0.2)
65
63
  lint_roller (~> 1.0)
66
64
  rubocop (~> 1.50)
67
- standard-performance (1.3.1)
65
+ standard-performance (1.6.0)
68
66
  lint_roller (~> 1.1)
69
- rubocop-performance (~> 1.20.2)
70
- unicode-display_width (2.5.0)
67
+ rubocop-performance (~> 1.23.0)
68
+ unicode-display_width (3.1.3)
69
+ unicode-emoji (~> 4.0, >= 4.0.4)
70
+ unicode-emoji (4.0.4)
71
71
  yard (0.9.34)
72
72
  yard-doctest (0.1.17)
73
73
  minitest
@@ -10,6 +10,6 @@ publish = false
10
10
  crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
- magnus = { version = "0.6.1" }
14
- rb-sys = { version = "0.9.87", features = ["stable-api-compiled-fallback"] }
15
- tiktoken-rs = { version = "0.5.9" }
13
+ magnus = { version = "0.7.1" }
14
+ rb-sys = { version = "0.9.106", features = ["stable-api-compiled-fallback"] }
15
+ tiktoken-rs = { version = "0.6.0" }
@@ -1,7 +1,8 @@
1
1
  use std::collections::HashSet;
2
2
 
3
- use crate::uncicode_error;
3
+ use tiktoken_rs::Rank;
4
4
 
5
+ use crate::uncicode_error;
5
6
 
6
7
  #[magnus::wrap(class = "Tiktoken::Ext::CoreBPE")]
7
8
  pub struct CoreBPEWrapper {
@@ -13,11 +14,15 @@ impl CoreBPEWrapper {
13
14
  Self { core_bpe }
14
15
  }
15
16
 
16
- pub fn encode_ordinary(&self, text: String) -> Vec<usize> {
17
+ pub fn encode_ordinary(&self, text: String) -> Vec<Rank> {
17
18
  self.core_bpe.encode_ordinary(text.as_str())
18
19
  }
19
20
 
20
- pub fn encode(&self, text: String, allowed_special: magnus::RArray) -> Result<Vec<usize>, magnus::Error> {
21
+ pub fn encode(
22
+ &self,
23
+ text: String,
24
+ allowed_special: magnus::RArray,
25
+ ) -> Result<Vec<Rank>, magnus::Error> {
21
26
  let allowed_special: Vec<String> = allowed_special.to_vec()?;
22
27
  let allowed_special: Vec<&str> = allowed_special.iter().map(|s| s.as_str()).collect();
23
28
  let allowed_special: HashSet<&str> = HashSet::from_iter(allowed_special.iter().cloned());
@@ -25,20 +30,18 @@ impl CoreBPEWrapper {
25
30
  Ok(self.core_bpe.encode(text.as_str(), allowed_special))
26
31
  }
27
32
 
28
- pub fn encode_with_special_tokens(&self, text: String) -> Vec<usize> {
33
+ pub fn encode_with_special_tokens(&self, text: String) -> Vec<Rank> {
29
34
  self.core_bpe.encode_with_special_tokens(text.as_str())
30
35
  }
31
36
 
32
- pub fn decode(&self, ids: Vec<usize>) -> Result<String, magnus::Error> {
33
- self.core_bpe.decode(ids)
34
- .map_err(|e| {
35
- let error = match uncicode_error() {
36
- Ok(error) => error,
37
- Err(e) => return e
38
- };
39
-
40
- magnus::Error::new(error, e.to_string())
41
- })
37
+ pub fn decode(&self, ids: Vec<Rank>) -> Result<String, magnus::Error> {
38
+ self.core_bpe.decode(ids).map_err(|e| {
39
+ let error = match uncicode_error() {
40
+ Ok(error) => error,
41
+ Err(e) => return e,
42
+ };
42
43
 
44
+ magnus::Error::new(error, e.to_string())
45
+ })
43
46
  }
44
47
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.10"
4
+ VERSION = "0.0.11"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -69,9 +69,12 @@ module Tiktoken
69
69
  ]
70
70
 
71
71
  # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
72
- # that is also MIT licensed but by OpenAI
72
+ # that is also MIT licensed but by OpenAI;
73
+ # https://github.com/Congyuwang/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs#L50
74
+ # is the source of the mapping for the Rust library
73
75
  MODEL_TO_ENCODING_NAME = {
74
76
  # chat
77
+ "chatgpt-4o-latest": "o200k_base",
75
78
  "gpt-4o": "o200k_base",
76
79
  "gpt-4": "cl100k_base",
77
80
  "gpt-3.5-turbo": "cl100k_base",
data/script/release CHANGED
@@ -7,25 +7,35 @@ if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then
7
7
  exit 1
8
8
  fi
9
9
 
10
+ run_id=""
11
+ # Parse arguments
12
+ while [[ "$#" -gt 0 ]]; do
13
+ case $1 in
14
+ --run-id)
15
+ run_id="$2"
16
+ shift 2
17
+ ;;
18
+ *)
19
+ echo "Unknown parameter passed: $1"
20
+ exit 1
21
+ ;;
22
+ esac
23
+ done
24
+
25
+ if [ -z "${run_id}" ]; then
26
+ echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow."
27
+ exit 1
28
+ fi
29
+
10
30
  version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2)
11
- echo "Building tiktoken_ruby v$version"
12
-
13
- targets=(
14
- "arm64-darwin"
15
- "x86_64-darwin"
16
- "aarch64-linux"
17
- "x86_64-linux"
18
- "x86_64-linux-musl"
19
- "arm-linux"
20
- "x64-mingw-ucrt"
21
- )
22
-
23
- # for target in "${targets[@]}"; do
24
- # bundle exec rb-sys-dock -p "$target" --ruby-versions 3.2 --build
25
- # done
26
-
27
- for gem in pkg/tiktoken_ruby-"$version"-*.gem ; do
28
- GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
31
+ echo "Building tiktoken_ruby v$version, using artifacts from run $run_id"
32
+
33
+ rm -rf pkg/cross-compiled
34
+ gh run download "$run_id" -D pkg/cross-compiled
35
+
36
+ for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do
37
+ echo "Publishing $gem"
38
+ GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
29
39
  done
30
40
 
31
41
  # last but not least, the uncompiled gem
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-12-29 00:00:00.000000000 Z
11
+ date: 2025-01-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 0.9.87
19
+ version: 0.9.106
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 0.9.87
26
+ version: 0.9.106
27
27
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
28
28
  used by OpenAI. It can be used to count the number of tokens in text before sending
29
29
  it to OpenAI APIs.
@@ -61,7 +61,7 @@ metadata:
61
61
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
62
62
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
63
63
  documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
64
- post_install_message:
64
+ post_install_message:
65
65
  rdoc_options: []
66
66
  require_paths:
67
67
  - lib
@@ -76,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
76
76
  - !ruby/object:Gem::Version
77
77
  version: 3.4.0
78
78
  requirements: []
79
- rubygems_version: 3.5.23
80
- signing_key:
79
+ rubygems_version: 3.5.22
80
+ signing_key:
81
81
  specification_version: 4
82
82
  summary: Ruby wrapper for Tiktoken
83
83
  test_files: []