tiktoken_ruby 0.0.7-x86_64-linux-musl → 0.0.9-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ddc84df649dfe1c46b98438d086bf0603510e3bdf692b1d572f249710372dc6c
4
- data.tar.gz: 5b2679459ca4e97c2a048752b5ea7dddc784efe4b5b30c8a00eff52b18e8c76b
3
+ metadata.gz: 68ab7cf30658d5a38c8a7f8cceb40cb7a5e99da1df56cea9d2eb660fcd7d3608
4
+ data.tar.gz: 748226cd09b35adc810cc381404c5a6745f84c353314416c55a798e55ebbc4a9
5
5
  SHA512:
6
- metadata.gz: 46bbfc78c272aac24d5288bd5947478359cc8a2789dc76619e2d7a393903dec037203373832f171a660d1288022dcc570e52adb219adf0007108e4be94a551cc
7
- data.tar.gz: 65f1261af3fe3c7d7c928a25ab679d6f3d7447955b7b07a1486a2c18cac466ed73440a8c71f2a00512080d3d22ad7881c2201bc148d4870516ae3cf5a80c86b0
6
+ metadata.gz: 8ffc23202ec430374c113b3aaa57b7c3ece7da8f192b0b722b6454dae52191932a4ca10ce30f2d580b23806170b5996b2c345cb00f4c8a1f71d425f69c02cc84
7
+ data.tar.gz: 904623d818d68385ca1c4cba8108331a85758fab54f1e2ba2597ee4fbde1fd20205c711607fc6e7cd85924acbc1e237e47868e8b58d411e01b146dac57fb538f
data/Gemfile.lock CHANGED
@@ -1,8 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.7)
5
- rb_sys (>= 0.9.86)
4
+ tiktoken_ruby (0.0.9)
5
+ rb_sys (= 0.9.87)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
@@ -22,7 +22,7 @@ GEM
22
22
  rake (13.1.0)
23
23
  rake-compiler (1.2.5)
24
24
  rake
25
- rb_sys (0.9.86)
25
+ rb_sys (0.9.87)
26
26
  regexp_parser (2.9.0)
27
27
  rexml (3.2.6)
28
28
  rspec (3.12.0)
@@ -89,4 +89,4 @@ DEPENDENCIES
89
89
  yard-doctest
90
90
 
91
91
  BUNDLED WITH
92
- 2.4.6
92
+ 2.4.4
data/README.md CHANGED
@@ -1,8 +1,14 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
2
+
2
3
  # tiktoken_ruby
3
4
 
4
5
  [Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
5
- This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
6
+ This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
7
+
8
+ ## Request for maintainers
9
+
10
+ I can't really put substantial time into maintaining this. Probably nothing more than a couple hours every few months. If you have experience maintaining Ruby gems and would like to
11
+ lend a hand, please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26).
6
12
 
7
13
  ## Installation
8
14
 
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
21
  $ gem install tiktoken_ruby
16
22
 
17
23
  ## Usage
24
+
18
25
  Usage should be very similar to the python library. Here's a simple example
19
26
 
20
27
  Encode and decode text
28
+
21
29
  ```ruby
22
30
  require 'tiktoken_ruby'
23
-
24
31
  enc = Tiktoken.get_encoding("cl100k_base")
25
32
  enc.decode(enc.encode("hello world")) #=> "hello world"
26
33
  ```
27
34
 
28
35
  Encoders can also be retrieved by model name
36
+
29
37
  ```ruby
30
38
  require 'tiktoken_ruby'
31
39
 
@@ -53,7 +61,6 @@ bundle exec rake compile
53
61
  bundle exec rake spec
54
62
  ```
55
63
 
56
-
57
64
  ## License
58
65
 
59
66
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
Binary file
Binary file
Binary file
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Tiktoken::Encoding
4
+ CACHE_MUTEX = Mutex.new
5
+
4
6
  attr_reader :name
5
7
 
6
8
  # This returns a new Tiktoken::Encoding instance for the requested encoding
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
15
17
  # @param encoding [Symbol] The name of the encoding to load
16
18
  # @return [Tiktoken::Encoding] The encoding instance
17
19
  def self.for_name_cached(encoding)
18
- @encodings ||= {}
19
- @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ CACHE_MUTEX.synchronize do
21
+ @encodings ||= {}
22
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
23
+ end
20
24
  end
21
25
 
22
26
  # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.7"
4
+ VERSION = "0.0.9"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -28,7 +28,7 @@ module Tiktoken
28
28
 
29
29
  # Gets the encoding for an OpenAI model
30
30
  # @param model_name [Symbol|String] The name of the model to get the encoding for
31
- # @return [Tiktoken::Encoding] The encoding instance
31
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
32
32
  # @example Count tokens for text
33
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
34
34
  # enc.encode("hello world").length #=> 2
@@ -37,10 +37,12 @@ module Tiktoken
37
37
  return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
38
38
  end
39
39
 
40
- MODEL_PREFIX_TO_ENCODING.each do |prefix, encoding|
41
- if model_name.start_with?(prefix.to_s)
42
- return get_encoding(encoding)
43
- end
40
+ _prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
41
+ model_name.start_with?(prefix.to_s)
42
+ end
43
+
44
+ if encoding
45
+ get_encoding(encoding)
44
46
  end
45
47
  end
46
48
 
@@ -62,13 +64,15 @@ module Tiktoken
62
64
  :r50k_base,
63
65
  :p50k_base,
64
66
  :p50k_edit,
65
- :cl100k_base
67
+ :cl100k_base,
68
+ :o200k_base
66
69
  ]
67
70
 
68
71
  # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
69
72
  # that is also MIT licensed but by OpenAI
70
73
  MODEL_TO_ENCODING_NAME = {
71
74
  # chat
75
+ "gpt-4o": "o200k_base",
72
76
  "gpt-4": "cl100k_base",
73
77
  "gpt-3.5-turbo": "cl100k_base",
74
78
  "gpt-35-turbo": "cl100k_base", # Azure deployment name
@@ -118,6 +122,7 @@ module Tiktoken
118
122
 
119
123
  MODEL_PREFIX_TO_ENCODING = {
120
124
  # chat
125
+ "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13, etc.
121
126
  "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
122
127
  "gpt-3.5-turbo-": "cl100k_base", # e.g., gpt-3.5-turbo-0301, -0401, etc.
123
128
  "gpt-35-turbo-": "cl100k_base", # Azure deployment name
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.9
5
5
  platform: x86_64-linux-musl
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-02-13 00:00:00.000000000 Z
11
+ date: 2024-05-16 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
14
  used by OpenAI. It can be used to count the number of tokens in text before sending
@@ -28,7 +28,6 @@ files:
28
28
  - Rakefile
29
29
  - doctest_helper.rb
30
30
  - lib/tiktoken_ruby.rb
31
- - lib/tiktoken_ruby/3.0/tiktoken_ruby.so
32
31
  - lib/tiktoken_ruby/3.1/tiktoken_ruby.so
33
32
  - lib/tiktoken_ruby/3.2/tiktoken_ruby.so
34
33
  - lib/tiktoken_ruby/3.3/tiktoken_ruby.so
@@ -50,7 +49,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
50
49
  requirements:
51
50
  - - ">="
52
51
  - !ruby/object:Gem::Version
53
- version: '3.0'
52
+ version: '3.1'
54
53
  - - "<"
55
54
  - !ruby/object:Gem::Version
56
55
  version: 3.4.dev
Binary file