tiktoken_ruby 0.0.1-arm64-darwin → 0.0.3-arm64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +9 -1
- data/README.md +20 -6
- data/doctest_helper.rb +1 -0
- data/lib/tiktoken_ruby/2.7/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/3.1/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/3.2/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/encoding.rb +33 -5
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +108 -2
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2222c8f701416004cca31e6ae0095b944e279c1cbfd1dfc035c25f4c3d8fc7ff
|
4
|
+
data.tar.gz: 2ea5b03f8eadfb064454d74db724880b8a5dd7746237b84328feb51ce5813cbe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 145d66621b038e291c93274c54528824e6eb1244ce4411d134e5dbf2f2aaefab9561cf06d04a66be343aa169c8e2586b9bea8e09631b58a9a96d23f829aa5020
|
7
|
+
data.tar.gz: 4b9af794c9be96281758fc70a0ddaccfb90e4b85fd63c161425175c65ed28a179df9a7daec82503bc0ddb9a9f978ad6da738db79220c1db57f0fedc86df88cf7
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tiktoken_ruby (0.0.
|
4
|
+
tiktoken_ruby (0.0.3)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -12,6 +12,7 @@ GEM
|
|
12
12
|
json (2.6.3)
|
13
13
|
language_server-protocol (3.17.0.3)
|
14
14
|
method_source (1.0.0)
|
15
|
+
minitest (5.18.0)
|
15
16
|
parallel (1.22.1)
|
16
17
|
parser (3.2.1.1)
|
17
18
|
ast (~> 2.4.1)
|
@@ -59,6 +60,12 @@ GEM
|
|
59
60
|
rubocop (= 1.48.1)
|
60
61
|
rubocop-performance (= 1.16.0)
|
61
62
|
unicode-display_width (2.4.2)
|
63
|
+
webrick (1.7.0)
|
64
|
+
yard (0.9.28)
|
65
|
+
webrick (~> 1.7.0)
|
66
|
+
yard-doctest (0.1.17)
|
67
|
+
minitest
|
68
|
+
yard
|
62
69
|
|
63
70
|
PLATFORMS
|
64
71
|
arm64-darwin-22
|
@@ -72,6 +79,7 @@ DEPENDENCIES
|
|
72
79
|
rspec (~> 3.0)
|
73
80
|
standard (~> 1.3)
|
74
81
|
tiktoken_ruby!
|
82
|
+
yard-doctest (~> 0.1.17)
|
75
83
|
|
76
84
|
BUNDLED WITH
|
77
85
|
2.4.6
|
data/README.md
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
[![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
|
1
2
|
# tiktoken_ruby
|
2
3
|
|
3
4
|
[Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
|
@@ -5,19 +6,32 @@ This is a wrapper around it aimed primarily at enabling accurate counts of GPT m
|
|
5
6
|
|
6
7
|
## Installation
|
7
8
|
|
8
|
-
TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_PRIOR_TO_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
|
9
|
-
|
10
9
|
Install the gem and add to the application's Gemfile by executing:
|
11
10
|
|
12
|
-
$ bundle add
|
11
|
+
$ bundle add tiktoken_ruby
|
13
12
|
|
14
13
|
If bundler is not being used to manage dependencies, install the gem by executing:
|
15
14
|
|
16
|
-
$ gem install
|
15
|
+
$ gem install tiktoken_ruby
|
17
16
|
|
18
17
|
## Usage
|
18
|
+
Usage should be very similar to the Python library. Here's a simple example:
|
19
|
+
|
20
|
+
Encode and decode text
|
21
|
+
```ruby
|
22
|
+
require 'tiktoken_ruby'
|
23
|
+
|
24
|
+
enc = Tiktoken.get_encoding("cl100k_base")
|
25
|
+
enc.decode(enc.encode("hello world")) #=> "hello world"
|
26
|
+
```
|
27
|
+
|
28
|
+
Encoders can also be retrieved by model name
|
29
|
+
```ruby
|
30
|
+
require 'tiktoken_ruby'
|
19
31
|
|
20
|
-
|
32
|
+
enc = Tiktoken.encoding_for_model("gpt-4")
|
33
|
+
enc.encode("hello world").length #=> 2
|
34
|
+
```
|
21
35
|
|
22
36
|
## Development
|
23
37
|
|
@@ -27,7 +41,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
27
41
|
|
28
42
|
## Contributing
|
29
43
|
|
30
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
44
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
|
31
45
|
|
32
46
|
## License
|
33
47
|
|
data/doctest_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'lib/tiktoken_ruby.rb'
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,23 +1,51 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
class Tiktoken::Encoding
|
4
|
-
|
5
|
-
|
3
|
+
class Tiktoken::Encoding
|
4
|
+
attr_reader :name
|
5
|
+
|
6
|
+
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
7
|
+
# @param encoding [Symbol] The name of the encoding to load
|
8
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
9
|
+
def self.for_name(encoding)
|
10
|
+
Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
|
6
11
|
end
|
7
12
|
|
8
|
-
|
9
|
-
|
13
|
+
# This returns a Tiktoken::Encoding instance for the requested encoding
|
14
|
+
# It will reuse an existing encoding if it's already been loaded
|
15
|
+
# @param encoding [Symbol] The name of the encoding to load
|
16
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
17
|
+
def self.for_name_cached(encoding)
|
18
|
+
@encodings ||= {}
|
19
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
10
20
|
end
|
11
21
|
|
22
|
+
# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
|
23
|
+
# basically it's unescaped
|
24
|
+
# @param text [String] The text to encode
|
25
|
+
# @return [Array<Integer>] The encoded tokens
|
12
26
|
def encode_ordinary(text)
|
13
27
|
@ext_base_bpe.encode_ordinary(text)
|
14
28
|
end
|
15
29
|
|
30
|
+
# Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
|
31
|
+
# as text unless they're in the allowed_special array. It's basically like the text was escaped
|
32
|
+
# @param text [String] The text to encode
|
33
|
+
# @param allowed_special [Array<String>] An array of special tokens to allow
|
34
|
+
# @return [Array<Integer>] The encoded tokens
|
16
35
|
def encode(text, allowed_special: [])
|
17
36
|
@ext_base_bpe.encode(text, allowed_special)
|
18
37
|
end
|
19
38
|
|
39
|
+
# Decodes the tokens back into text
|
40
|
+
# @param tokens [Array<Integer>] The tokens to decode
|
41
|
+
# @return [String] The decoded text
|
20
42
|
def decode(tokens)
|
21
43
|
@ext_base_bpe.decode(tokens)
|
22
44
|
end
|
45
|
+
|
46
|
+
private
|
47
|
+
def initialize(ext_base_bpe, name)
|
48
|
+
@ext_base_bpe = ext_base_bpe
|
49
|
+
@name = name
|
50
|
+
end
|
23
51
|
end
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -2,8 +2,114 @@
|
|
2
2
|
|
3
3
|
require_relative "tiktoken_ruby/version"
|
4
4
|
require_relative "tiktoken_ruby/encoding.rb"
|
5
|
-
|
5
|
+
|
6
|
+
begin
|
7
|
+
RUBY_VERSION =~ /(\d+\.\d+)/
|
8
|
+
require_relative "tiktoken_ruby/#{$1}/tiktoken_ruby"
|
9
|
+
rescue LoadError
|
10
|
+
require_relative "tiktoken_ruby/tiktoken_ruby"
|
11
|
+
end
|
6
12
|
|
7
13
|
module Tiktoken
|
8
|
-
class
|
14
|
+
class << self
|
15
|
+
|
16
|
+
# Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
|
17
|
+
# it will reuse the instance of that type that was previously loaded
|
18
|
+
# @param name [Symbol|String] The name of the encoding to load
|
19
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
20
|
+
# @example Encode and decode text
|
21
|
+
# enc = Tiktoken.get_encoding("cl100k_base")
|
22
|
+
# enc.decode(enc.encode("hello world")) #=> "hello world"
|
23
|
+
def get_encoding(name)
|
24
|
+
name = name.to_sym
|
25
|
+
return nil unless SUPPORTED_ENCODINGS.include?(name)
|
26
|
+
|
27
|
+
Tiktoken::Encoding.for_name_cached(name)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Gets the encoding for an OpenAI model
|
31
|
+
# @param model_name [Symbol|String] The name of the model to get the encoding for
|
32
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
33
|
+
# @example Count tokens for text
|
34
|
+
# enc = Tiktoken.encoding_for_model("gpt-4")
|
35
|
+
# enc.encode("hello world").length #=> 2
|
36
|
+
def encoding_for_model(model_name)
|
37
|
+
for prefix in PREFIX_MODELS
|
38
|
+
if model_name.to_s.start_with?("#{prefix}-")
|
39
|
+
model_name = prefix
|
40
|
+
break
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
|
45
|
+
return nil unless encoding_name
|
46
|
+
|
47
|
+
get_encoding(encoding_name)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Lists all the encodings that are supported
|
51
|
+
# @return [Array<Symbol>] The list of supported encodings
|
52
|
+
def list_encoding_names
|
53
|
+
SUPPORTED_ENCODINGS
|
54
|
+
end
|
55
|
+
|
56
|
+
# Lists all the models that are supported
|
57
|
+
# @return [Array<Symbol>] The list of supported models
|
58
|
+
def list_model_names
|
59
|
+
MODEL_TO_ENCODING_NAME.keys
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
SUPPORTED_ENCODINGS = [
|
65
|
+
:r50k_base,
|
66
|
+
:p50k_base,
|
67
|
+
:p50k_edit,
|
68
|
+
:cl100k_base,
|
69
|
+
]
|
70
|
+
|
71
|
+
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
|
72
|
+
# that is also MIT licensed but by OpenAI
|
73
|
+
MODEL_TO_ENCODING_NAME = {
|
74
|
+
"gpt-4": "cl100k_base",
|
75
|
+
"gpt-3.5-turbo": "cl100k_base",
|
76
|
+
# text
|
77
|
+
"text-davinci-003": "p50k_base",
|
78
|
+
"text-davinci-002": "p50k_base",
|
79
|
+
"text-davinci-001": "r50k_base",
|
80
|
+
"text-curie-001": "r50k_base",
|
81
|
+
"text-babbage-001": "r50k_base",
|
82
|
+
"text-ada-001": "r50k_base",
|
83
|
+
"davinci": "r50k_base",
|
84
|
+
"curie": "r50k_base",
|
85
|
+
"babbage": "r50k_base",
|
86
|
+
"ada": "r50k_base",
|
87
|
+
# code
|
88
|
+
"code-davinci-002": "p50k_base",
|
89
|
+
"code-davinci-001": "p50k_base",
|
90
|
+
"code-cushman-002": "p50k_base",
|
91
|
+
"code-cushman-001": "p50k_base",
|
92
|
+
"davinci-codex": "p50k_base",
|
93
|
+
"cushman-codex": "p50k_base",
|
94
|
+
# edit
|
95
|
+
"text-davinci-edit-001": "p50k_edit",
|
96
|
+
"code-davinci-edit-001": "p50k_edit",
|
97
|
+
# embeddings
|
98
|
+
"text-embedding-ada-002": "cl100k_base",
|
99
|
+
# old embeddings
|
100
|
+
"text-similarity-davinci-001": "r50k_base",
|
101
|
+
"text-similarity-curie-001": "r50k_base",
|
102
|
+
"text-similarity-babbage-001": "r50k_base",
|
103
|
+
"text-similarity-ada-001": "r50k_base",
|
104
|
+
"text-search-davinci-doc-001": "r50k_base",
|
105
|
+
"text-search-curie-doc-001": "r50k_base",
|
106
|
+
"text-search-babbage-doc-001": "r50k_base",
|
107
|
+
"text-search-ada-doc-001": "r50k_base",
|
108
|
+
"code-search-babbage-code-001": "r50k_base",
|
109
|
+
"code-search-ada-code-001": "r50k_base",
|
110
|
+
}
|
111
|
+
|
112
|
+
# these are models that have versioned variants that are otherwise identical
|
113
|
+
PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
|
114
|
+
end
|
9
115
|
end
|
metadata
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.3
|
5
5
|
platform: arm64-darwin
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-03-
|
11
|
+
date: 2023-03-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
13
|
+
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
14
|
+
used by OpenAI. It can be used to count the number of tokens in text before sending
|
15
|
+
it to OpenAI APIs.
|
14
16
|
email:
|
15
17
|
- isaac.a.park@gmail.com
|
16
18
|
executables: []
|
@@ -24,6 +26,7 @@ files:
|
|
24
26
|
- LICENSE.txt
|
25
27
|
- README.md
|
26
28
|
- Rakefile
|
29
|
+
- doctest_helper.rb
|
27
30
|
- lib/tiktoken_ruby.rb
|
28
31
|
- lib/tiktoken_ruby/2.7/tiktoken_ruby.bundle
|
29
32
|
- lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle
|
@@ -38,6 +41,7 @@ licenses:
|
|
38
41
|
metadata:
|
39
42
|
homepage_uri: https://github.com/IAPark/tiktoken_ruby
|
40
43
|
source_code_uri: https://github.com/IAPark/tiktoken_ruby
|
44
|
+
documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
|
41
45
|
post_install_message:
|
42
46
|
rdoc_options: []
|
43
47
|
require_paths:
|
@@ -54,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
58
|
requirements:
|
55
59
|
- - ">="
|
56
60
|
- !ruby/object:Gem::Version
|
57
|
-
version: 3.
|
61
|
+
version: 3.1.0
|
58
62
|
requirements: []
|
59
63
|
rubygems_version: 3.4.4
|
60
64
|
signing_key:
|