tiktoken_ruby 0.0.7-x86_64-linux → 0.0.9-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7b9c3466ceeeee7d87cca03d5efb8e2c614d40d9a09c702ff1368a29f98169dc
4
- data.tar.gz: 1eaedb15f958909e4b5252c0cfb56b458e7a8ac8d59f56f78beeb4ddf458f6f6
3
+ metadata.gz: b2981777cb81fd3fade2688e66f0f6711c19956a3737b834fa1db1a1c0e2aabe
4
+ data.tar.gz: fbbac592954875ceb1c469f8266e3b69a845a3a960c8f1de7e45c2b0cf2dd74c
5
5
  SHA512:
6
- metadata.gz: bad48ee5f6c36c5a4d2b2b266ccfe760bd8b1d4a42fd619da8bf26fde51853660fa86aba1e754803876fccb50b00209622d56d882645c0b7fe01d6de9d1b02c1
7
- data.tar.gz: b2c1a31c2adf480eb66df143d25931116ba667f03f3048e88b997f685d982ef9f940e715ea271cbd61ae61b4976165c6760dc06053387aea41aa251a6b23f976
6
+ metadata.gz: 603c244d51c2254f15ab748ccade9f2dd3a5ebaa04b4e02119636a23901d3b24103231473ee97ccab36b82a19234a111312cad8919a5b1477638e592b6aa059b
7
+ data.tar.gz: cdf87538056946d3799027098b2ce43a3134d419513c0ad7f69db5ee6014c4bade22ab6e5d006537a1b6fb0e6ab86d88b1be39ef16babdd18d4e0e37545f71b2
data/Gemfile.lock CHANGED
@@ -1,8 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.7)
5
- rb_sys (>= 0.9.86)
4
+ tiktoken_ruby (0.0.9)
5
+ rb_sys (= 0.9.87)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
@@ -22,7 +22,7 @@ GEM
22
22
  rake (13.1.0)
23
23
  rake-compiler (1.2.5)
24
24
  rake
25
- rb_sys (0.9.86)
25
+ rb_sys (0.9.87)
26
26
  regexp_parser (2.9.0)
27
27
  rexml (3.2.6)
28
28
  rspec (3.12.0)
@@ -89,4 +89,4 @@ DEPENDENCIES
89
89
  yard-doctest
90
90
 
91
91
  BUNDLED WITH
92
- 2.4.6
92
+ 2.4.4
data/README.md CHANGED
@@ -1,8 +1,14 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
2
+
2
3
  # tiktoken_ruby
3
4
 
4
5
  [Tiktoken](https://github.com/openai/tiktoken) is BPE tokenizer from OpenAI used with their GPT models.
5
- This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
6
+ This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
7
+
8
+ ## Request for maintainers
9
+
10
+ I can't really put substantial time into maintaining this. Probably nothing more than a couple hours every few months. If you have experience maintaining ruby gems and would like to
11
+ lend a hand please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26)
6
12
 
7
13
  ## Installation
8
14
 
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
21
  $ gem install tiktoken_ruby
16
22
 
17
23
  ## Usage
24
+
18
25
  Usage should be very similar to the python library. Here's a simple example
19
26
 
20
27
  Encode and decode text
28
+
21
29
  ```ruby
22
30
  require 'tiktoken_ruby'
23
-
24
31
  enc = Tiktoken.get_encoding("cl100k_base")
25
32
  enc.decode(enc.encode("hello world")) #=> "hello world"
26
33
  ```
27
34
 
28
35
  Encoders can also be retrieved by model name
36
+
29
37
  ```ruby
30
38
  require 'tiktoken_ruby'
31
39
 
@@ -53,7 +61,6 @@ bundle exec rake compile
53
61
  bundle exec rake spec
54
62
  ```
55
63
 
56
-
57
64
  ## License
58
65
 
59
66
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
Binary file
Binary file
Binary file
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Tiktoken::Encoding
4
+ CACHE_MUTEX = Mutex.new
5
+
4
6
  attr_reader :name
5
7
 
6
8
  # This returns a new Tiktoken::Encoding instance for the requested encoding
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
15
17
  # @param encoding [Symbol] The name of the encoding to load
16
18
  # @return [Tiktoken::Encoding] The encoding instance
17
19
  def self.for_name_cached(encoding)
18
- @encodings ||= {}
19
- @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ CACHE_MUTEX.synchronize do
21
+ @encodings ||= {}
22
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
23
+ end
20
24
  end
21
25
 
22
26
  # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.7"
4
+ VERSION = "0.0.9"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -28,7 +28,7 @@ module Tiktoken
28
28
 
29
29
  # Gets the encoding for an OpenAI model
30
30
  # @param model_name [Symbol|String] The name of the model to get the encoding for
31
- # @return [Tiktoken::Encoding] The encoding instance
31
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
32
32
  # @example Count tokens for text
33
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
34
34
  # enc.encode("hello world").length #=> 2
@@ -37,10 +37,12 @@ module Tiktoken
37
37
  return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
38
38
  end
39
39
 
40
- MODEL_PREFIX_TO_ENCODING.each do |prefix, encoding|
41
- if model_name.start_with?(prefix.to_s)
42
- return get_encoding(encoding)
43
- end
40
+ _prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
41
+ model_name.start_with?(prefix.to_s)
42
+ end
43
+
44
+ if encoding
45
+ get_encoding(encoding)
44
46
  end
45
47
  end
46
48
 
@@ -62,13 +64,15 @@ module Tiktoken
62
64
  :r50k_base,
63
65
  :p50k_base,
64
66
  :p50k_edit,
65
- :cl100k_base
67
+ :cl100k_base,
68
+ :o200k_base
66
69
  ]
67
70
 
68
71
  # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
69
72
  # that is also MIT licensed but by OpenAI
70
73
  MODEL_TO_ENCODING_NAME = {
71
74
  # chat
75
+ "gpt-4o": "o200k_base",
72
76
  "gpt-4": "cl100k_base",
73
77
  "gpt-3.5-turbo": "cl100k_base",
74
78
  "gpt-35-turbo": "cl100k_base", # Azure deployment name
@@ -118,6 +122,7 @@ module Tiktoken
118
122
 
119
123
  MODEL_PREFIX_TO_ENCODING = {
120
124
  # chat
125
+ "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13, etc.
121
126
  "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
122
127
  "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
123
128
  "gpt-35-turbo-": "cl100k_base", # Azure deployment name
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.9
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-02-14 00:00:00.000000000 Z
11
+ date: 2024-05-16 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
14
  used by OpenAI. It can be used to count the number of tokens in text before sending
@@ -28,7 +28,6 @@ files:
28
28
  - Rakefile
29
29
  - doctest_helper.rb
30
30
  - lib/tiktoken_ruby.rb
31
- - lib/tiktoken_ruby/3.0/tiktoken_ruby.so
32
31
  - lib/tiktoken_ruby/3.1/tiktoken_ruby.so
33
32
  - lib/tiktoken_ruby/3.2/tiktoken_ruby.so
34
33
  - lib/tiktoken_ruby/3.3/tiktoken_ruby.so
@@ -50,7 +49,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
50
49
  requirements:
51
50
  - - ">="
52
51
  - !ruby/object:Gem::Version
53
- version: '3.0'
52
+ version: '3.1'
54
53
  - - "<"
55
54
  - !ruby/object:Gem::Version
56
55
  version: 3.4.dev
Binary file