tiktoken_ruby 0.0.5 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ee8bc941d0d8de83fb99c33acf9c831d2bd7542f54726a2ff3ec8acf41fdaab8
4
- data.tar.gz: ca70753b8823328f08123328ad4409b351e99a2d564e0fc8c8e779a4b378a2b3
3
+ metadata.gz: 347a10d045e27fca4cdfec03c4d2eac0150448b8f2125d5bcbcd1b92db83499a
4
+ data.tar.gz: de048e8320daa15b27ffa7ccdd9f7ec618cb2a3ad96fe2bedf71e6f780fc8b6f
5
5
  SHA512:
6
- metadata.gz: '031225799558c1bd563ca6d40b7525f956155e7dabe899c0a5c440eedc29727fb7e8504136251623ca63ad0d2877aadf4673ceb6cda5c440aa8f9362d119d119'
7
- data.tar.gz: 9df34194052e5bdf958d214e6a0d5a9cdda9341401e8226956dc2de05406d22b67a3c7a67fd998af95bf981f1ced34daf8084aeaf9afec0f6e409f416eaa362a
6
+ metadata.gz: 7c587d4b18e777a0f7692aab857fec1fc3b3bcceceab158439197123786475c65d51f5356351bcf0c9c327b94e2ff15a83b2144c2db329aedea0456c1d763ff9
7
+ data.tar.gz: abfdeb836d81555effa5ef6647a77b360dae2b975bb3dc23a22c779f6dd82256630942ddfef01a837ff5195896fa4251925e87dec0d245dce1c8e83719a488f1
data/Cargo.lock CHANGED
@@ -31,21 +31,22 @@ checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
31
31
 
32
32
  [[package]]
33
33
  name = "bindgen"
34
- version = "0.60.1"
34
+ version = "0.69.4"
35
35
  source = "registry+https://github.com/rust-lang/crates.io-index"
36
- checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6"
36
+ checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
37
37
  dependencies = [
38
- "bitflags",
38
+ "bitflags 2.4.0",
39
39
  "cexpr",
40
40
  "clang-sys",
41
+ "itertools",
41
42
  "lazy_static",
42
43
  "lazycell",
43
- "peeking_take_while",
44
44
  "proc-macro2",
45
45
  "quote",
46
46
  "regex",
47
47
  "rustc-hash",
48
48
  "shlex",
49
+ "syn",
49
50
  ]
50
51
 
51
52
  [[package]]
@@ -69,6 +70,12 @@ version = "1.3.2"
69
70
  source = "registry+https://github.com/rust-lang/crates.io-index"
70
71
  checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
71
72
 
73
+ [[package]]
74
+ name = "bitflags"
75
+ version = "2.4.0"
76
+ source = "registry+https://github.com/rust-lang/crates.io-index"
77
+ checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
78
+
72
79
  [[package]]
73
80
  name = "bstr"
74
81
  version = "1.4.0"
@@ -107,6 +114,12 @@ dependencies = [
107
114
  "libloading",
108
115
  ]
109
116
 
117
+ [[package]]
118
+ name = "either"
119
+ version = "1.10.0"
120
+ source = "registry+https://github.com/rust-lang/crates.io-index"
121
+ checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
122
+
110
123
  [[package]]
111
124
  name = "fancy-regex"
112
125
  version = "0.11.0"
@@ -123,6 +136,15 @@ version = "0.3.1"
123
136
  source = "registry+https://github.com/rust-lang/crates.io-index"
124
137
  checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
125
138
 
139
+ [[package]]
140
+ name = "itertools"
141
+ version = "0.12.1"
142
+ source = "registry+https://github.com/rust-lang/crates.io-index"
143
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
144
+ dependencies = [
145
+ "either",
146
+ ]
147
+
126
148
  [[package]]
127
149
  name = "lazy_static"
128
150
  version = "1.4.0"
@@ -163,20 +185,21 @@ dependencies = [
163
185
 
164
186
  [[package]]
165
187
  name = "magnus"
166
- version = "0.4.4"
188
+ version = "0.6.1"
167
189
  source = "registry+https://github.com/rust-lang/crates.io-index"
168
- checksum = "fc87660cd7daa49fddbfd524c836de54d5c927d520cd163f43700c5087c57d6c"
190
+ checksum = "0516897a45f8ce8270a8910bcb94cd83538b19b6ae3a0c281a765df170b64695"
169
191
  dependencies = [
170
192
  "magnus-macros",
171
193
  "rb-sys",
172
194
  "rb-sys-env",
195
+ "seq-macro",
173
196
  ]
174
197
 
175
198
  [[package]]
176
199
  name = "magnus-macros"
177
- version = "0.3.0"
200
+ version = "0.6.0"
178
201
  source = "registry+https://github.com/rust-lang/crates.io-index"
179
- checksum = "206cb23bfeea05180c97522ef6a3e52a4eb17b0ed2f30ee3ca9c4f994d2378ae"
202
+ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
180
203
  dependencies = [
181
204
  "proc-macro2",
182
205
  "quote",
@@ -234,44 +257,38 @@ dependencies = [
234
257
  "windows-sys",
235
258
  ]
236
259
 
237
- [[package]]
238
- name = "peeking_take_while"
239
- version = "0.1.2"
240
- source = "registry+https://github.com/rust-lang/crates.io-index"
241
- checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
242
-
243
260
  [[package]]
244
261
  name = "proc-macro2"
245
- version = "1.0.52"
262
+ version = "1.0.66"
246
263
  source = "registry+https://github.com/rust-lang/crates.io-index"
247
- checksum = "1d0e1ae9e836cc3beddd63db0df682593d7e2d3d891ae8c9083d2113e1744224"
264
+ checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
248
265
  dependencies = [
249
266
  "unicode-ident",
250
267
  ]
251
268
 
252
269
  [[package]]
253
270
  name = "quote"
254
- version = "1.0.26"
271
+ version = "1.0.33"
255
272
  source = "registry+https://github.com/rust-lang/crates.io-index"
256
- checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
273
+ checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
257
274
  dependencies = [
258
275
  "proc-macro2",
259
276
  ]
260
277
 
261
278
  [[package]]
262
279
  name = "rb-sys"
263
- version = "0.9.68"
280
+ version = "0.9.87"
264
281
  source = "registry+https://github.com/rust-lang/crates.io-index"
265
- checksum = "4783528e031c3902524cfe685e0008d644e8b34bad4d1a19c1f39f4394d0777b"
282
+ checksum = "225103e3d69bbfe8831f9fd0d2461335f3a9dd06aa6e88bcb6d6970383494d06"
266
283
  dependencies = [
267
284
  "rb-sys-build",
268
285
  ]
269
286
 
270
287
  [[package]]
271
288
  name = "rb-sys-build"
272
- version = "0.9.68"
289
+ version = "0.9.87"
273
290
  source = "registry+https://github.com/rust-lang/crates.io-index"
274
- checksum = "14c04474335dd597126d20fe90d94504b3582a30cbc36bc72c7d6ed3ef8cf168"
291
+ checksum = "bacce8095a5167d5ede618bbd9353e9d9e2f32ddaf54be911106f0ee6baacf09"
275
292
  dependencies = [
276
293
  "bindgen",
277
294
  "lazy_static",
@@ -294,7 +311,7 @@ version = "0.2.16"
294
311
  source = "registry+https://github.com/rust-lang/crates.io-index"
295
312
  checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
296
313
  dependencies = [
297
- "bitflags",
314
+ "bitflags 1.3.2",
298
315
  ]
299
316
 
300
317
  [[package]]
@@ -332,6 +349,12 @@ version = "1.1.0"
332
349
  source = "registry+https://github.com/rust-lang/crates.io-index"
333
350
  checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
334
351
 
352
+ [[package]]
353
+ name = "seq-macro"
354
+ version = "0.3.5"
355
+ source = "registry+https://github.com/rust-lang/crates.io-index"
356
+ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
357
+
335
358
  [[package]]
336
359
  name = "serde"
337
360
  version = "1.0.157"
@@ -358,9 +381,9 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
358
381
 
359
382
  [[package]]
360
383
  name = "syn"
361
- version = "1.0.109"
384
+ version = "2.0.31"
362
385
  source = "registry+https://github.com/rust-lang/crates.io-index"
363
- checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
386
+ checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398"
364
387
  dependencies = [
365
388
  "proc-macro2",
366
389
  "quote",
@@ -386,6 +409,7 @@ name = "tiktoken_ruby"
386
409
  version = "0.1.0"
387
410
  dependencies = [
388
411
  "magnus",
412
+ "rb-sys",
389
413
  "tiktoken-rs",
390
414
  ]
391
415
 
data/Gemfile CHANGED
@@ -2,15 +2,11 @@
2
2
 
3
3
  source "https://rubygems.org"
4
4
 
5
- # Specify your gem's dependencies in tiktoken_ruby.gemspec
6
5
  gemspec
7
6
 
8
- gem "rake", "~> 13.0"
9
-
7
+ gem "rake"
10
8
  gem "rake-compiler"
11
-
12
- gem "rspec", "~> 3.0"
13
-
14
- gem "standard", "~> 1.3"
15
-
16
- gem "yard-doctest", "~> 0.1.17"
9
+ gem "rspec"
10
+ gem "standard"
11
+ gem "yard-doctest"
12
+ gem "racc"
data/Gemfile.lock CHANGED
@@ -1,64 +1,74 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.5)
5
- rb_sys (~> 0.9.68)
4
+ tiktoken_ruby (0.0.8)
5
+ rb_sys (>= 0.9.86)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
10
  ast (2.4.2)
11
11
  diff-lcs (1.5.0)
12
- json (2.6.3)
12
+ json (2.7.1)
13
13
  language_server-protocol (3.17.0.3)
14
- minitest (5.18.0)
15
- parallel (1.22.1)
16
- parser (3.2.1.1)
14
+ lint_roller (1.1.0)
15
+ minitest (5.21.2)
16
+ parallel (1.24.0)
17
+ parser (3.3.0.4)
17
18
  ast (~> 2.4.1)
19
+ racc
20
+ racc (1.7.3)
18
21
  rainbow (3.1.1)
19
- rake (13.0.6)
20
- rake-compiler (1.2.1)
22
+ rake (13.1.0)
23
+ rake-compiler (1.2.5)
21
24
  rake
22
- rb_sys (0.9.68)
23
- regexp_parser (2.7.0)
24
- rexml (3.2.5)
25
+ rb_sys (0.9.86)
26
+ regexp_parser (2.9.0)
27
+ rexml (3.2.6)
25
28
  rspec (3.12.0)
26
29
  rspec-core (~> 3.12.0)
27
30
  rspec-expectations (~> 3.12.0)
28
31
  rspec-mocks (~> 3.12.0)
29
- rspec-core (3.12.1)
32
+ rspec-core (3.12.2)
30
33
  rspec-support (~> 3.12.0)
31
- rspec-expectations (3.12.2)
34
+ rspec-expectations (3.12.3)
32
35
  diff-lcs (>= 1.2.0, < 2.0)
33
36
  rspec-support (~> 3.12.0)
34
- rspec-mocks (3.12.4)
37
+ rspec-mocks (3.12.6)
35
38
  diff-lcs (>= 1.2.0, < 2.0)
36
39
  rspec-support (~> 3.12.0)
37
- rspec-support (3.12.0)
38
- rubocop (1.48.1)
40
+ rspec-support (3.12.1)
41
+ rubocop (1.59.0)
39
42
  json (~> 2.3)
43
+ language_server-protocol (>= 3.17.0)
40
44
  parallel (~> 1.10)
41
- parser (>= 3.2.0.0)
45
+ parser (>= 3.2.2.4)
42
46
  rainbow (>= 2.2.2, < 4.0)
43
47
  regexp_parser (>= 1.8, < 3.0)
44
48
  rexml (>= 3.2.5, < 4.0)
45
- rubocop-ast (>= 1.26.0, < 2.0)
49
+ rubocop-ast (>= 1.30.0, < 2.0)
46
50
  ruby-progressbar (~> 1.7)
47
51
  unicode-display_width (>= 2.4.0, < 3.0)
48
- rubocop-ast (1.27.0)
52
+ rubocop-ast (1.30.0)
49
53
  parser (>= 3.2.1.0)
50
- rubocop-performance (1.16.0)
51
- rubocop (>= 1.7.0, < 2.0)
52
- rubocop-ast (>= 0.4.0)
54
+ rubocop-performance (1.20.2)
55
+ rubocop (>= 1.48.1, < 2.0)
56
+ rubocop-ast (>= 1.30.0, < 2.0)
53
57
  ruby-progressbar (1.13.0)
54
- standard (1.25.1)
58
+ standard (1.33.0)
55
59
  language_server-protocol (~> 3.17.0.2)
56
- rubocop (= 1.48.1)
57
- rubocop-performance (= 1.16.0)
58
- unicode-display_width (2.4.2)
59
- webrick (1.7.0)
60
- yard (0.9.28)
61
- webrick (~> 1.7.0)
60
+ lint_roller (~> 1.0)
61
+ rubocop (~> 1.59.0)
62
+ standard-custom (~> 1.0.0)
63
+ standard-performance (~> 1.3)
64
+ standard-custom (1.0.2)
65
+ lint_roller (~> 1.0)
66
+ rubocop (~> 1.50)
67
+ standard-performance (1.3.1)
68
+ lint_roller (~> 1.1)
69
+ rubocop-performance (~> 1.20.2)
70
+ unicode-display_width (2.5.0)
71
+ yard (0.9.34)
62
72
  yard-doctest (0.1.17)
63
73
  minitest
64
74
  yard
@@ -70,12 +80,13 @@ PLATFORMS
70
80
  x86_64-linux
71
81
 
72
82
  DEPENDENCIES
73
- rake (~> 13.0)
83
+ racc
84
+ rake
74
85
  rake-compiler
75
- rspec (~> 3.0)
76
- standard (~> 1.3)
86
+ rspec
87
+ standard
77
88
  tiktoken_ruby!
78
- yard-doctest (~> 0.1.17)
89
+ yard-doctest
79
90
 
80
91
  BUNDLED WITH
81
92
  2.4.6
data/README.md CHANGED
@@ -1,8 +1,14 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
2
+
2
3
  # tiktoken_ruby
3
4
 
4
5
  [Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
5
- This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
6
+ This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
7
+
8
+ ## Request for maintainers
9
+
10
+ I can't really put substantial time into maintaining this, probably nothing more than a couple of hours every few months. If you have experience maintaining Ruby gems and would like to
11
+ lend a hand, please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26).
6
12
 
7
13
  ## Installation
8
14
 
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
21
  $ gem install tiktoken_ruby
16
22
 
17
23
  ## Usage
24
+
18
25
  Usage should be very similar to the Python library. Here's a simple example:
19
26
 
20
27
  Encode and decode text
28
+
21
29
  ```ruby
22
30
  require 'tiktoken_ruby'
23
-
24
31
  enc = Tiktoken.get_encoding("cl100k_base")
25
32
  enc.decode(enc.encode("hello world")) #=> "hello world"
26
33
  ```
27
34
 
28
35
  Encoders can also be retrieved by model name
36
+
29
37
  ```ruby
30
38
  require 'tiktoken_ruby'
31
39
 
@@ -53,7 +61,6 @@ bundle exec rake compile
53
61
  bundle exec rake spec
54
62
  ```
55
63
 
56
-
57
64
  ## License
58
65
 
59
66
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -10,5 +10,6 @@ publish = false
10
10
  crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
- magnus = { version = "0.4" }
13
+ magnus = { version = "0.6.1" }
14
+ rb-sys = { version = "*", features = ["stable-api-compiled-fallback"] }
14
15
  tiktoken-rs = { git = "https://github.com/IAPark/tiktoken-rs.git" }
@@ -1,7 +1,7 @@
1
1
  mod core_bpe_wrapper;
2
2
 
3
3
  use core_bpe_wrapper::CoreBPEWrapper;
4
- use magnus::{define_module, function, prelude::*, Error, method, class, RModule, ExceptionClass};
4
+ use magnus::{class, define_module, function, method, prelude::*, Error, ExceptionClass, RModule};
5
5
 
6
6
  fn r50k_base() -> CoreBPEWrapper {
7
7
  let core_bpe = tiktoken_rs::r50k_base().unwrap();
@@ -38,14 +38,18 @@ fn init() -> Result<(), Error> {
38
38
  factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
39
39
  factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
40
40
 
41
-
42
41
  let ext_module = module.define_module("Ext")?;
43
42
  let bpe_class = ext_module.define_class("CoreBPE", class::object())?;
44
43
 
45
- bpe_class.define_method("encode_ordinary", method!(CoreBPEWrapper::encode_ordinary, 1))?;
44
+ bpe_class.define_method(
45
+ "encode_ordinary",
46
+ method!(CoreBPEWrapper::encode_ordinary, 1),
47
+ )?;
46
48
  bpe_class.define_method("encode", method!(CoreBPEWrapper::encode, 2))?;
47
- bpe_class.define_method("encode_with_special_tokens", method!(CoreBPEWrapper::encode_with_special_tokens, 1))?;
48
-
49
+ bpe_class.define_method(
50
+ "encode_with_special_tokens",
51
+ method!(CoreBPEWrapper::encode_with_special_tokens, 1),
52
+ )?;
49
53
 
50
54
  bpe_class.define_method("decode", method!(CoreBPEWrapper::decode, 1))?;
51
55
  Ok(())
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Tiktoken::Encoding
4
+ CACHE_MUTEX = Mutex.new
5
+
4
6
  attr_reader :name
5
7
 
6
8
  # This returns a new Tiktoken::Encoding instance for the requested encoding
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
15
17
  # @param encoding [Symbol] The name of the encoding to load
16
18
  # @return [Tiktoken::Encoding] The encoding instance
17
19
  def self.for_name_cached(encoding)
18
- @encodings ||= {}
19
- @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ CACHE_MUTEX.synchronize do
21
+ @encodings ||= {}
22
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
23
+ end
20
24
  end
21
25
 
22
26
  # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.5"
4
+ VERSION = "0.0.8"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -28,22 +28,22 @@ module Tiktoken
28
28
 
29
29
  # Gets the encoding for an OpenAI model
30
30
  # @param model_name [Symbol|String] The name of the model to get the encoding for
31
- # @return [Tiktoken::Encoding] The encoding instance
31
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
32
32
  # @example Count tokens for text
33
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
34
34
  # enc.encode("hello world").length #=> 2
35
35
  def encoding_for_model(model_name)
36
- PREFIX_MODELS.each do |prefix|
37
- if model_name.to_s.start_with?("#{prefix}-")
38
- model_name = prefix
39
- break
40
- end
36
+ if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
37
+ return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
41
38
  end
42
39
 
43
- encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
44
- return nil unless encoding_name
40
+ _prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
41
+ model_name.start_with?(prefix.to_s)
42
+ end
45
43
 
46
- get_encoding(encoding_name)
44
+ if encoding
45
+ get_encoding(encoding)
46
+ end
47
47
  end
48
48
 
49
49
  # Lists all the encodings that are supported
@@ -67,12 +67,22 @@ module Tiktoken
67
67
  :cl100k_base
68
68
  ]
69
69
 
70
- # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
70
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
71
71
  # that is also MIT licensed but by OpenAI
72
72
  MODEL_TO_ENCODING_NAME = {
73
+ # chat
73
74
  "gpt-4": "cl100k_base",
74
75
  "gpt-3.5-turbo": "cl100k_base",
75
- # text
76
+ "gpt-35-turbo": "cl100k_base", # Azure deployment name
77
+ # base
78
+ "davinci-002": "cl100k_base",
79
+ "babbage-002": "cl100k_base",
80
+ # embeddings
81
+ "text-embedding-ada-002": "cl100k_base",
82
+ "text-embedding-3-small": "cl100k_base",
83
+ "text-embedding-3-large": "cl100k_base",
84
+ # DEPRECATED MODELS
85
+ # text (DEPRECATED)
76
86
  "text-davinci-003": "p50k_base",
77
87
  "text-davinci-002": "p50k_base",
78
88
  "text-davinci-001": "r50k_base",
@@ -83,19 +93,17 @@ module Tiktoken
83
93
  curie: "r50k_base",
84
94
  babbage: "r50k_base",
85
95
  ada: "r50k_base",
86
- # code
96
+ # code (DEPRECATED)
87
97
  "code-davinci-002": "p50k_base",
88
98
  "code-davinci-001": "p50k_base",
89
99
  "code-cushman-002": "p50k_base",
90
100
  "code-cushman-001": "p50k_base",
91
101
  "davinci-codex": "p50k_base",
92
102
  "cushman-codex": "p50k_base",
93
- # edit
103
+ # edit (DEPRECATED)
94
104
  "text-davinci-edit-001": "p50k_edit",
95
105
  "code-davinci-edit-001": "p50k_edit",
96
- # embeddings
97
- "text-embedding-ada-002": "cl100k_base",
98
- # old embeddings
106
+ # old embeddings (DEPRECATED)
99
107
  "text-similarity-davinci-001": "r50k_base",
100
108
  "text-similarity-curie-001": "r50k_base",
101
109
  "text-similarity-babbage-001": "r50k_base",
@@ -105,10 +113,21 @@ module Tiktoken
105
113
  "text-search-babbage-doc-001": "r50k_base",
106
114
  "text-search-ada-doc-001": "r50k_base",
107
115
  "code-search-babbage-code-001": "r50k_base",
108
- "code-search-ada-code-001": "r50k_base"
116
+ "code-search-ada-code-001": "r50k_base",
117
+ # open source
118
+ gpt2: "gpt2"
109
119
  }
110
120
 
111
- # these are models that have a versioned models that are otherwise identical
112
- PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
121
+ MODEL_PREFIX_TO_ENCODING = {
122
+ # chat
123
+ "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
124
+ "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
125
+ "gpt-35-turbo-": "cl100k_base", # Azure deployment name
126
+ # fine-tuned
127
+ "ft:gpt-4": "cl100k_base",
128
+ "ft:gpt-3.5-turbo": "cl100k_base",
129
+ "ft:davinci-002": "cl100k_base",
130
+ "ft:babbage-002": "cl100k_base"
131
+ }
113
132
  end
114
133
  end
@@ -7,12 +7,10 @@ Gem::Specification.new do |spec|
7
7
  spec.version = Tiktoken::VERSION
8
8
  spec.authors = ["IAPark"]
9
9
  spec.email = ["isaac.a.park@gmail.com"]
10
-
11
10
  spec.summary = "Ruby wrapper for Tiktoken"
12
11
  spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
13
12
  "a BPE tokenizer written by and used by OpenAI. It can be used to " \
14
13
  "count the number of tokens in text before sending it to OpenAI APIs."
15
-
16
14
  spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
17
15
  spec.license = "MIT"
18
16
  spec.required_ruby_version = ">= 2.7.0"
@@ -22,11 +20,6 @@ Gem::Specification.new do |spec|
22
20
  spec.metadata["homepage_uri"] = spec.homepage
23
21
  spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
24
22
  spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
25
-
26
- # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
27
-
28
- # Specify which files should be added to the gem when it is released.
29
- # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
30
23
  spec.files = Dir.chdir(__dir__) do
31
24
  `git ls-files -z`.split("\x0").reject do |f|
32
25
  (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
@@ -36,9 +29,5 @@ Gem::Specification.new do |spec|
36
29
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
37
30
  spec.require_paths = ["lib"]
38
31
  spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
39
-
40
- spec.add_dependency "rb_sys", "~> 0.9.68"
41
-
42
- # For more information and examples about making a new gem, check out our
43
- # guide at: https://bundler.io/guides/creating_gem.html
32
+ spec.add_dependency "rb_sys", ">= 0.9.86"
44
33
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-11 00:00:00.000000000 Z
11
+ date: 2024-04-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.9.68
19
+ version: 0.9.86
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.9.68
26
+ version: 0.9.86
27
27
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
28
28
  used by OpenAI. It can be used to count the number of tokens in text before sending
29
29
  it to OpenAI APIs.