tiktoken_ruby 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +9 -1
- data/README.md +14 -3
- data/doctest_helper.rb +1 -0
- data/ext/tiktoken_ruby/src/lib.rs +1 -1
- data/lib/tiktoken_ruby/encoding.rb +33 -5
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +101 -1
- data/tiktoken_ruby.gemspec +6 -3
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: abd9dc6cb619b53de92f8e788dc81ca5df2a01e6de72152cbac3110ca1d228a6
|
4
|
+
data.tar.gz: c0f2e679550ec6a8bec569b3c16e735ac469839a58c4ce2747a6d37dd5399d17
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d2386df25639f6e713b450cd0f671bb7613c11f4d7925f566703e422ff22f21e48c8176ef277792908da161232242c58816c0d0964f66494089a5a93732ee062
|
7
|
+
data.tar.gz: d03fb53b1f9cd4f8d37cd0ceaa5b1943d55d981e4288c55bba5483b7253400e06f60618289c292539cf2a7fbad3752a85bd4ac02ef12c6ad870a6db22a0bd3d8
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tiktoken_ruby (0.0.2)
|
4
|
+
tiktoken_ruby (0.0.3)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -12,6 +12,7 @@ GEM
|
|
12
12
|
json (2.6.3)
|
13
13
|
language_server-protocol (3.17.0.3)
|
14
14
|
method_source (1.0.0)
|
15
|
+
minitest (5.18.0)
|
15
16
|
parallel (1.22.1)
|
16
17
|
parser (3.2.1.1)
|
17
18
|
ast (~> 2.4.1)
|
@@ -59,6 +60,12 @@ GEM
|
|
59
60
|
rubocop (= 1.48.1)
|
60
61
|
rubocop-performance (= 1.16.0)
|
61
62
|
unicode-display_width (2.4.2)
|
63
|
+
webrick (1.7.0)
|
64
|
+
yard (0.9.28)
|
65
|
+
webrick (~> 1.7.0)
|
66
|
+
yard-doctest (0.1.17)
|
67
|
+
minitest
|
68
|
+
yard
|
62
69
|
|
63
70
|
PLATFORMS
|
64
71
|
arm64-darwin-22
|
@@ -71,6 +78,7 @@ DEPENDENCIES
|
|
71
78
|
rspec (~> 3.0)
|
72
79
|
standard (~> 1.3)
|
73
80
|
tiktoken_ruby!
|
81
|
+
yard-doctest (~> 0.1.17)
|
74
82
|
|
75
83
|
BUNDLED WITH
|
76
84
|
2.4.6
|
data/README.md
CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
15
15
|
$ gem install tiktoken_ruby
|
16
16
|
|
17
17
|
## Usage
|
18
|
+
Usage should be very similar to the Python library. Here's a simple example:
|
18
19
|
|
20
|
+
Encode and decode text
|
19
21
|
```ruby
|
20
|
-
|
21
|
-
|
22
|
-
|
22
|
+
require 'tiktoken_ruby'
|
23
|
+
|
24
|
+
enc = Tiktoken.get_encoding("cl100k_base")
|
25
|
+
enc.decode(enc.encode("hello world")) #=> "hello world"
|
26
|
+
```
|
27
|
+
|
28
|
+
Encoders can also be retrieved by model name
|
29
|
+
```ruby
|
30
|
+
require 'tiktoken_ruby'
|
31
|
+
|
32
|
+
enc = Tiktoken.encoding_for_model("gpt-4")
|
33
|
+
enc.encode("hello world").length #=> 2
|
23
34
|
```
|
24
35
|
|
25
36
|
## Development
|
data/doctest_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'lib/tiktoken_ruby.rb'
|
@@ -35,7 +35,7 @@ fn init() -> Result<(), Error> {
|
|
35
35
|
let factory_module = module.define_module("BpeFactory")?;
|
36
36
|
factory_module.define_singleton_method("r50k_base", function!(r50k_base, 0))?;
|
37
37
|
factory_module.define_singleton_method("p50k_base", function!(p50k_base, 0))?;
|
38
|
-
factory_module.define_singleton_method("
|
38
|
+
factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
|
39
39
|
factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
|
40
40
|
|
41
41
|
|
@@ -1,23 +1,51 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
class Tiktoken::Encoding
|
4
|
-
|
5
|
-
|
3
|
+
class Tiktoken::Encoding
|
4
|
+
attr_reader :name
|
5
|
+
|
6
|
+
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
7
|
+
# @param encoding [Symbol] The name of the encoding to load
|
8
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
9
|
+
def self.for_name(encoding)
|
10
|
+
Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
|
6
11
|
end
|
7
12
|
|
8
|
-
|
9
|
-
|
13
|
+
# This returns a Tiktoken::Encoding instance for the requested encoding
|
14
|
+
# It will reuse an existing encoding if it's already been loaded
|
15
|
+
# @param encoding [Symbol] The name of the encoding to load
|
16
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
17
|
+
def self.for_name_cached(encoding)
|
18
|
+
@encodings ||= {}
|
19
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
10
20
|
end
|
11
21
|
|
22
|
+
# Encodes the text as a list of integer tokens. This encoding will encode special non-text tokens
|
23
|
+
# basically it's unescaped
|
24
|
+
# @param text [String] The text to encode
|
25
|
+
# @return [Array<Integer>] The encoded tokens
|
12
26
|
def encode_ordinary(text)
|
13
27
|
@ext_base_bpe.encode_ordinary(text)
|
14
28
|
end
|
15
29
|
|
30
|
+
# Encodes the text as a list of integer tokens. This encoding will treat special non-text tokens
|
31
|
+
# as text unless they're in the allowed_special array. It's basically like the text was escaped
|
32
|
+
# @param text [String] The text to encode
|
33
|
+
# @param allowed_special [Array<String>] An array of special tokens to allow
|
34
|
+
# @return [Array<Integer>] The encoded tokens
|
16
35
|
def encode(text, allowed_special: [])
|
17
36
|
@ext_base_bpe.encode(text, allowed_special)
|
18
37
|
end
|
19
38
|
|
39
|
+
# Decodes the tokens back into text
|
40
|
+
# @param tokens [Array<Integer>] The tokens to decode
|
41
|
+
# @return [String] The decoded text
|
20
42
|
def decode(tokens)
|
21
43
|
@ext_base_bpe.decode(tokens)
|
22
44
|
end
|
45
|
+
|
46
|
+
private
|
47
|
+
def initialize(ext_base_bpe, name)
|
48
|
+
@ext_base_bpe = ext_base_bpe
|
49
|
+
@name = name
|
50
|
+
end
|
23
51
|
end
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -11,5 +11,105 @@ rescue LoadError
|
|
11
11
|
end
|
12
12
|
|
13
13
|
module Tiktoken
|
14
|
-
class
|
14
|
+
class << self
|
15
|
+
|
16
|
+
# Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
|
17
|
+
# it will reuse the instance of that type that was previously loaded
|
18
|
+
# @param name [Symbol|String] The name of the encoding to load
|
19
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
20
|
+
# @example Encode and decode text
|
21
|
+
# enc = Tiktoken.get_encoding("cl100k_base")
|
22
|
+
# enc.decode(enc.encode("hello world")) #=> "hello world"
|
23
|
+
def get_encoding(name)
|
24
|
+
name = name.to_sym
|
25
|
+
return nil unless SUPPORTED_ENCODINGS.include?(name)
|
26
|
+
|
27
|
+
Tiktoken::Encoding.for_name_cached(name)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Gets the encoding for an OpenAI model
|
31
|
+
# @param model_name [Symbol|String] The name of the model to get the encoding for
|
32
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
33
|
+
# @example Count tokens for text
|
34
|
+
# enc = Tiktoken.encoding_for_model("gpt-4")
|
35
|
+
# enc.encode("hello world").length #=> 2
|
36
|
+
def encoding_for_model(model_name)
|
37
|
+
for prefix in PREFIX_MODELS
|
38
|
+
if model_name.to_s.start_with?("#{prefix}-")
|
39
|
+
model_name = prefix
|
40
|
+
break
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
|
45
|
+
return nil unless encoding_name
|
46
|
+
|
47
|
+
get_encoding(encoding_name)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Lists all the encodings that are supported
|
51
|
+
# @return [Array<Symbol>] The list of supported encodings
|
52
|
+
def list_encoding_names
|
53
|
+
SUPPORTED_ENCODINGS
|
54
|
+
end
|
55
|
+
|
56
|
+
# Lists all the models that are supported
|
57
|
+
# @return [Array<Symbol>] The list of supported models
|
58
|
+
def list_model_names
|
59
|
+
MODEL_TO_ENCODING_NAME.keys
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
SUPPORTED_ENCODINGS = [
|
65
|
+
:r50k_base,
|
66
|
+
:p50k_base,
|
67
|
+
:p50k_edit,
|
68
|
+
:cl100k_base,
|
69
|
+
]
|
70
|
+
|
71
|
+
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
|
72
|
+
# that is also MIT licensed but by OpenAI
|
73
|
+
MODEL_TO_ENCODING_NAME = {
|
74
|
+
"gpt-4": "cl100k_base",
|
75
|
+
"gpt-3.5-turbo": "cl100k_base",
|
76
|
+
# text
|
77
|
+
"text-davinci-003": "p50k_base",
|
78
|
+
"text-davinci-002": "p50k_base",
|
79
|
+
"text-davinci-001": "r50k_base",
|
80
|
+
"text-curie-001": "r50k_base",
|
81
|
+
"text-babbage-001": "r50k_base",
|
82
|
+
"text-ada-001": "r50k_base",
|
83
|
+
"davinci": "r50k_base",
|
84
|
+
"curie": "r50k_base",
|
85
|
+
"babbage": "r50k_base",
|
86
|
+
"ada": "r50k_base",
|
87
|
+
# code
|
88
|
+
"code-davinci-002": "p50k_base",
|
89
|
+
"code-davinci-001": "p50k_base",
|
90
|
+
"code-cushman-002": "p50k_base",
|
91
|
+
"code-cushman-001": "p50k_base",
|
92
|
+
"davinci-codex": "p50k_base",
|
93
|
+
"cushman-codex": "p50k_base",
|
94
|
+
# edit
|
95
|
+
"text-davinci-edit-001": "p50k_edit",
|
96
|
+
"code-davinci-edit-001": "p50k_edit",
|
97
|
+
# embeddings
|
98
|
+
"text-embedding-ada-002": "cl100k_base",
|
99
|
+
# old embeddings
|
100
|
+
"text-similarity-davinci-001": "r50k_base",
|
101
|
+
"text-similarity-curie-001": "r50k_base",
|
102
|
+
"text-similarity-babbage-001": "r50k_base",
|
103
|
+
"text-similarity-ada-001": "r50k_base",
|
104
|
+
"text-search-davinci-doc-001": "r50k_base",
|
105
|
+
"text-search-curie-doc-001": "r50k_base",
|
106
|
+
"text-search-babbage-doc-001": "r50k_base",
|
107
|
+
"text-search-ada-doc-001": "r50k_base",
|
108
|
+
"code-search-babbage-code-001": "r50k_base",
|
109
|
+
"code-search-ada-code-001": "r50k_base",
|
110
|
+
}
|
111
|
+
|
112
|
+
# these are models that have versioned variants that are otherwise identical
|
113
|
+
PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
|
114
|
+
end
|
15
115
|
end
|
data/tiktoken_ruby.gemspec
CHANGED
@@ -9,17 +9,20 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.email = ["isaac.a.park@gmail.com"]
|
10
10
|
|
11
11
|
spec.summary = "Ruby wrapper for Tiktoken"
|
12
|
-
spec.description = "
|
12
|
+
spec.description = "An unofficial Ruby wrapper for Tiktoken, " +
|
13
|
+
"a BPE tokenizer written by and used by OpenAI. It can be used to " +
|
14
|
+
"count the number of tokens in text before sending it to OpenAI APIs."
|
15
|
+
|
13
16
|
spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
|
14
17
|
spec.license = "MIT"
|
15
18
|
spec.required_ruby_version = ">= 2.6.0"
|
16
19
|
spec.required_rubygems_version = ">= 3.1.0"
|
17
20
|
spec.platform = Gem::Platform::RUBY
|
18
21
|
|
19
|
-
#spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
|
20
|
-
|
21
22
|
spec.metadata["homepage_uri"] = spec.homepage
|
22
23
|
spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
|
24
|
+
spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
|
25
|
+
|
23
26
|
#spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
24
27
|
|
25
28
|
# Specify which files should be added to the gem when it is released.
|
metadata
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-03-
|
11
|
+
date: 2023-03-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
13
|
+
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
14
|
+
used by OpenAI. It can be used to count the number of tokens in text before sending
|
15
|
+
it to OpenAI APIs.
|
14
16
|
email:
|
15
17
|
- isaac.a.park@gmail.com
|
16
18
|
executables: []
|
@@ -27,6 +29,7 @@ files:
|
|
27
29
|
- LICENSE.txt
|
28
30
|
- README.md
|
29
31
|
- Rakefile
|
32
|
+
- doctest_helper.rb
|
30
33
|
- ext/tiktoken_ruby/Cargo.toml
|
31
34
|
- ext/tiktoken_ruby/extconf.rb
|
32
35
|
- ext/tiktoken_ruby/src/core_bpe_wrapper.rs
|
@@ -42,6 +45,7 @@ licenses:
|
|
42
45
|
metadata:
|
43
46
|
homepage_uri: https://github.com/IAPark/tiktoken_ruby
|
44
47
|
source_code_uri: https://github.com/IAPark/tiktoken_ruby
|
48
|
+
documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
|
45
49
|
post_install_message:
|
46
50
|
rdoc_options: []
|
47
51
|
require_paths:
|