tiktoken_ruby 0.0.6-x86_64-darwin → 0.0.8-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -9
- data/Gemfile.lock +45 -34
- data/README.md +10 -3
- data/lib/tiktoken_ruby/3.1/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/3.2/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/{3.0 → 3.3}/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/encoding.rb +6 -2
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +38 -19
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5952edc6d35682fd1f1e11e2e3b592a910cbf564267ec1d674ebdcac8197e6ab
|
4
|
+
data.tar.gz: 456ccf90cd89bffdf2a8631a253b2bdb8ec97fe6bbd4b2d82b5198935a0b5bbb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5aaa9e3debf8189797adaee23dbdaa1ca7455dea5e7cfb86b0b16dac07db2df6a3322c3541c2af88a1fc22eaf8a6dc9bf56a56db80a780ad91a82854a6e4b663
|
7
|
+
data.tar.gz: 8c76c8e1220523539880a5b79d877aa3c2bd3d3ee51a0b6c6025502534cc9973c562b8aaeda291fa05376ef14cb81105449b690d06918b75839484bb811cbc8c
|
data/Gemfile
CHANGED
@@ -2,15 +2,11 @@
|
|
2
2
|
|
3
3
|
source "https://rubygems.org"
|
4
4
|
|
5
|
-
# Specify your gem's dependencies in tiktoken_ruby.gemspec
|
6
5
|
gemspec
|
7
6
|
|
8
|
-
gem "rake"
|
9
|
-
|
7
|
+
gem "rake"
|
10
8
|
gem "rake-compiler"
|
11
|
-
|
12
|
-
gem "
|
13
|
-
|
14
|
-
gem "
|
15
|
-
|
16
|
-
gem "yard-doctest", "~> 0.1.17"
|
9
|
+
gem "rspec"
|
10
|
+
gem "standard"
|
11
|
+
gem "yard-doctest"
|
12
|
+
gem "racc"
|
data/Gemfile.lock
CHANGED
@@ -1,64 +1,74 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tiktoken_ruby (0.0.
|
5
|
-
rb_sys (
|
4
|
+
tiktoken_ruby (0.0.8)
|
5
|
+
rb_sys (= 0.9.87)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
10
|
ast (2.4.2)
|
11
11
|
diff-lcs (1.5.0)
|
12
|
-
json (2.
|
12
|
+
json (2.7.1)
|
13
13
|
language_server-protocol (3.17.0.3)
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
lint_roller (1.1.0)
|
15
|
+
minitest (5.21.2)
|
16
|
+
parallel (1.24.0)
|
17
|
+
parser (3.3.0.4)
|
17
18
|
ast (~> 2.4.1)
|
19
|
+
racc
|
20
|
+
racc (1.7.3)
|
18
21
|
rainbow (3.1.1)
|
19
|
-
rake (13.0
|
20
|
-
rake-compiler (1.2.
|
22
|
+
rake (13.1.0)
|
23
|
+
rake-compiler (1.2.5)
|
21
24
|
rake
|
22
|
-
rb_sys (0.9.
|
23
|
-
regexp_parser (2.
|
24
|
-
rexml (3.2.
|
25
|
+
rb_sys (0.9.87)
|
26
|
+
regexp_parser (2.9.0)
|
27
|
+
rexml (3.2.6)
|
25
28
|
rspec (3.12.0)
|
26
29
|
rspec-core (~> 3.12.0)
|
27
30
|
rspec-expectations (~> 3.12.0)
|
28
31
|
rspec-mocks (~> 3.12.0)
|
29
|
-
rspec-core (3.12.
|
32
|
+
rspec-core (3.12.2)
|
30
33
|
rspec-support (~> 3.12.0)
|
31
|
-
rspec-expectations (3.12.
|
34
|
+
rspec-expectations (3.12.3)
|
32
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
33
36
|
rspec-support (~> 3.12.0)
|
34
|
-
rspec-mocks (3.12.
|
37
|
+
rspec-mocks (3.12.6)
|
35
38
|
diff-lcs (>= 1.2.0, < 2.0)
|
36
39
|
rspec-support (~> 3.12.0)
|
37
|
-
rspec-support (3.12.
|
38
|
-
rubocop (1.
|
40
|
+
rspec-support (3.12.1)
|
41
|
+
rubocop (1.59.0)
|
39
42
|
json (~> 2.3)
|
43
|
+
language_server-protocol (>= 3.17.0)
|
40
44
|
parallel (~> 1.10)
|
41
|
-
parser (>= 3.2.
|
45
|
+
parser (>= 3.2.2.4)
|
42
46
|
rainbow (>= 2.2.2, < 4.0)
|
43
47
|
regexp_parser (>= 1.8, < 3.0)
|
44
48
|
rexml (>= 3.2.5, < 4.0)
|
45
|
-
rubocop-ast (>= 1.
|
49
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
46
50
|
ruby-progressbar (~> 1.7)
|
47
51
|
unicode-display_width (>= 2.4.0, < 3.0)
|
48
|
-
rubocop-ast (1.
|
52
|
+
rubocop-ast (1.30.0)
|
49
53
|
parser (>= 3.2.1.0)
|
50
|
-
rubocop-performance (1.
|
51
|
-
rubocop (>= 1.
|
52
|
-
rubocop-ast (>=
|
54
|
+
rubocop-performance (1.20.2)
|
55
|
+
rubocop (>= 1.48.1, < 2.0)
|
56
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
53
57
|
ruby-progressbar (1.13.0)
|
54
|
-
standard (1.
|
58
|
+
standard (1.33.0)
|
55
59
|
language_server-protocol (~> 3.17.0.2)
|
56
|
-
|
57
|
-
rubocop
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
60
|
+
lint_roller (~> 1.0)
|
61
|
+
rubocop (~> 1.59.0)
|
62
|
+
standard-custom (~> 1.0.0)
|
63
|
+
standard-performance (~> 1.3)
|
64
|
+
standard-custom (1.0.2)
|
65
|
+
lint_roller (~> 1.0)
|
66
|
+
rubocop (~> 1.50)
|
67
|
+
standard-performance (1.3.1)
|
68
|
+
lint_roller (~> 1.1)
|
69
|
+
rubocop-performance (~> 1.20.2)
|
70
|
+
unicode-display_width (2.5.0)
|
71
|
+
yard (0.9.34)
|
62
72
|
yard-doctest (0.1.17)
|
63
73
|
minitest
|
64
74
|
yard
|
@@ -70,12 +80,13 @@ PLATFORMS
|
|
70
80
|
x86_64-linux
|
71
81
|
|
72
82
|
DEPENDENCIES
|
73
|
-
|
83
|
+
racc
|
84
|
+
rake
|
74
85
|
rake-compiler
|
75
|
-
rspec
|
76
|
-
standard
|
86
|
+
rspec
|
87
|
+
standard
|
77
88
|
tiktoken_ruby!
|
78
|
-
yard-doctest
|
89
|
+
yard-doctest
|
79
90
|
|
80
91
|
BUNDLED WITH
|
81
|
-
2.4.
|
92
|
+
2.4.4
|
data/README.md
CHANGED
@@ -1,8 +1,14 @@
|
|
1
1
|
[![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
|
2
|
+
|
2
3
|
# tiktoken_ruby
|
3
4
|
|
4
5
|
[Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
|
5
|
-
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
6
|
+
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
7
|
+
|
8
|
+
## Request for maintainers
|
9
|
+
|
10
|
+
I can't really put substantial time into maintaining this. Probably nothing more than a couple hours every few months. If you have experience maintaining ruby gems and would like to
|
11
|
+
lend a hand, please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26).
|
6
12
|
|
7
13
|
## Installation
|
8
14
|
|
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
15
21
|
$ gem install tiktoken_ruby
|
16
22
|
|
17
23
|
## Usage
|
24
|
+
|
18
25
|
Usage should be very similar to the Python library. Here's a simple example:
|
19
26
|
|
20
27
|
Encode and decode text
|
28
|
+
|
21
29
|
```ruby
|
22
30
|
require 'tiktoken_ruby'
|
23
|
-
|
24
31
|
enc = Tiktoken.get_encoding("cl100k_base")
|
25
32
|
enc.decode(enc.encode("hello world")) #=> "hello world"
|
26
33
|
```
|
27
34
|
|
28
35
|
Encoders can also be retrieved by model name
|
36
|
+
|
29
37
|
```ruby
|
30
38
|
require 'tiktoken_ruby'
|
31
39
|
|
@@ -53,7 +61,6 @@ bundle exec rake compile
|
|
53
61
|
bundle exec rake spec
|
54
62
|
```
|
55
63
|
|
56
|
-
|
57
64
|
## License
|
58
65
|
|
59
66
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,6 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class Tiktoken::Encoding
|
4
|
+
CACHE_MUTEX = Mutex.new
|
5
|
+
|
4
6
|
attr_reader :name
|
5
7
|
|
6
8
|
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
|
|
15
17
|
# @param encoding [Symbol] The name of the encoding to load
|
16
18
|
# @return [Tiktoken::Encoding] The encoding instance
|
17
19
|
def self.for_name_cached(encoding)
|
18
|
-
|
19
|
-
|
20
|
+
CACHE_MUTEX.synchronize do
|
21
|
+
@encodings ||= {}
|
22
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
23
|
+
end
|
20
24
|
end
|
21
25
|
|
22
26
|
# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -28,22 +28,22 @@ module Tiktoken
|
|
28
28
|
|
29
29
|
# Gets the encoding for an OpenAI model
|
30
30
|
# @param model_name [Symbol|String] The name of the model to get the encoding for
|
31
|
-
# @return [Tiktoken::Encoding] The encoding instance
|
31
|
+
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
|
32
32
|
# @example Count tokens for text
|
33
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
34
34
|
# enc.encode("hello world").length #=> 2
|
35
35
|
def encoding_for_model(model_name)
|
36
|
-
|
37
|
-
|
38
|
-
model_name = prefix
|
39
|
-
break
|
40
|
-
end
|
36
|
+
if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
|
37
|
+
return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
|
41
38
|
end
|
42
39
|
|
43
|
-
|
44
|
-
|
40
|
+
_prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
|
41
|
+
model_name.start_with?(prefix.to_s)
|
42
|
+
end
|
45
43
|
|
46
|
-
|
44
|
+
if encoding
|
45
|
+
get_encoding(encoding)
|
46
|
+
end
|
47
47
|
end
|
48
48
|
|
49
49
|
# Lists all the encodings that are supported
|
@@ -67,12 +67,22 @@ module Tiktoken
|
|
67
67
|
:cl100k_base
|
68
68
|
]
|
69
69
|
|
70
|
-
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
70
|
+
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
71
71
|
# that is also MIT licensed but by OpenAI
|
72
72
|
MODEL_TO_ENCODING_NAME = {
|
73
|
+
# chat
|
73
74
|
"gpt-4": "cl100k_base",
|
74
75
|
"gpt-3.5-turbo": "cl100k_base",
|
75
|
-
#
|
76
|
+
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
77
|
+
# base
|
78
|
+
"davinci-002": "cl100k_base",
|
79
|
+
"babbage-002": "cl100k_base",
|
80
|
+
# embeddings
|
81
|
+
"text-embedding-ada-002": "cl100k_base",
|
82
|
+
"text-embedding-3-small": "cl100k_base",
|
83
|
+
"text-embedding-3-large": "cl100k_base",
|
84
|
+
# DEPRECATED MODELS
|
85
|
+
# text (DEPRECATED)
|
76
86
|
"text-davinci-003": "p50k_base",
|
77
87
|
"text-davinci-002": "p50k_base",
|
78
88
|
"text-davinci-001": "r50k_base",
|
@@ -83,19 +93,17 @@ module Tiktoken
|
|
83
93
|
curie: "r50k_base",
|
84
94
|
babbage: "r50k_base",
|
85
95
|
ada: "r50k_base",
|
86
|
-
# code
|
96
|
+
# code (DEPRECATED)
|
87
97
|
"code-davinci-002": "p50k_base",
|
88
98
|
"code-davinci-001": "p50k_base",
|
89
99
|
"code-cushman-002": "p50k_base",
|
90
100
|
"code-cushman-001": "p50k_base",
|
91
101
|
"davinci-codex": "p50k_base",
|
92
102
|
"cushman-codex": "p50k_base",
|
93
|
-
# edit
|
103
|
+
# edit (DEPRECATED)
|
94
104
|
"text-davinci-edit-001": "p50k_edit",
|
95
105
|
"code-davinci-edit-001": "p50k_edit",
|
96
|
-
# embeddings
|
97
|
-
"text-embedding-ada-002": "cl100k_base",
|
98
|
-
# old embeddings
|
106
|
+
# old embeddings (DEPRECATED)
|
99
107
|
"text-similarity-davinci-001": "r50k_base",
|
100
108
|
"text-similarity-curie-001": "r50k_base",
|
101
109
|
"text-similarity-babbage-001": "r50k_base",
|
@@ -105,10 +113,21 @@ module Tiktoken
|
|
105
113
|
"text-search-babbage-doc-001": "r50k_base",
|
106
114
|
"text-search-ada-doc-001": "r50k_base",
|
107
115
|
"code-search-babbage-code-001": "r50k_base",
|
108
|
-
"code-search-ada-code-001": "r50k_base"
|
116
|
+
"code-search-ada-code-001": "r50k_base",
|
117
|
+
# open source
|
118
|
+
gpt2: "gpt2"
|
109
119
|
}
|
110
120
|
|
111
|
-
|
112
|
-
|
121
|
+
MODEL_PREFIX_TO_ENCODING = {
|
122
|
+
# chat
|
123
|
+
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
124
|
+
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
125
|
+
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
126
|
+
# fine-tuned
|
127
|
+
"ft:gpt-4": "cl100k_base",
|
128
|
+
"ft:gpt-3.5-turbo": "cl100k_base",
|
129
|
+
"ft:davinci-002": "cl100k_base",
|
130
|
+
"ft:babbage-002": "cl100k_base"
|
131
|
+
}
|
113
132
|
end
|
114
133
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
14
14
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
@@ -28,9 +28,9 @@ files:
|
|
28
28
|
- Rakefile
|
29
29
|
- doctest_helper.rb
|
30
30
|
- lib/tiktoken_ruby.rb
|
31
|
-
- lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle
|
32
31
|
- lib/tiktoken_ruby/3.1/tiktoken_ruby.bundle
|
33
32
|
- lib/tiktoken_ruby/3.2/tiktoken_ruby.bundle
|
33
|
+
- lib/tiktoken_ruby/3.3/tiktoken_ruby.bundle
|
34
34
|
- lib/tiktoken_ruby/encoding.rb
|
35
35
|
- lib/tiktoken_ruby/version.rb
|
36
36
|
- sig/tiktoken_ruby.rbs
|
@@ -49,10 +49,10 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
49
49
|
requirements:
|
50
50
|
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version: '3.
|
52
|
+
version: '3.1'
|
53
53
|
- - "<"
|
54
54
|
- !ruby/object:Gem::Version
|
55
|
-
version: 3.
|
55
|
+
version: 3.4.dev
|
56
56
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
57
|
requirements:
|
58
58
|
- - ">="
|