tiktoken_ruby 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/Gemfile.lock +3 -7
- data/README.md +12 -1
- data/Rakefile +0 -1
- data/doctest_helper.rb +1 -1
- data/lib/tiktoken_ruby/encoding.rb +48 -47
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +8 -9
- data/tiktoken_ruby.gemspec +7 -11
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1c461629e4b1174e825e05f4644e7bcdaac90849047997c6724358435c70f0bb
|
4
|
+
data.tar.gz: 244217e1b298a998c468723d2082bcf10bb46e6a80168d808a56ee4b618bbf6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2920fba51441a33440435c310e2389723ed5e63dd150e0303743e09ee2ef5f68499e14f9df6254daa7a355568fbdf54e91a9ef3eebba30182f757e7f190ba11
|
7
|
+
data.tar.gz: b0da8ae135800a6405e546a03f1c17319978096716872a3d4b2fbe7fe812f2061f73d02eb8540f866d41a009d21c075273e096c3073be6ae1b5562f799f806c1
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,24 +1,19 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tiktoken_ruby (0.0.
|
4
|
+
tiktoken_ruby (0.0.4)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.2)
|
10
|
-
coderay (1.1.3)
|
11
10
|
diff-lcs (1.5.0)
|
12
11
|
json (2.6.3)
|
13
12
|
language_server-protocol (3.17.0.3)
|
14
|
-
method_source (1.0.0)
|
15
13
|
minitest (5.18.0)
|
16
14
|
parallel (1.22.1)
|
17
15
|
parser (3.2.1.1)
|
18
16
|
ast (~> 2.4.1)
|
19
|
-
pry (0.14.2)
|
20
|
-
coderay (~> 1.1)
|
21
|
-
method_source (~> 1.0)
|
22
17
|
rainbow (3.1.1)
|
23
18
|
rake (13.0.6)
|
24
19
|
rake-compiler (1.2.1)
|
@@ -69,9 +64,10 @@ GEM
|
|
69
64
|
|
70
65
|
PLATFORMS
|
71
66
|
arm64-darwin-22
|
67
|
+
x86_64-darwin-22
|
68
|
+
x86_64-linux
|
72
69
|
|
73
70
|
DEPENDENCIES
|
74
|
-
pry (~> 0.14.2)
|
75
71
|
rake (~> 13.0)
|
76
72
|
rake-compiler
|
77
73
|
rb_sys
|
data/README.md
CHANGED
@@ -15,7 +15,7 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
15
15
|
$ gem install tiktoken_ruby
|
16
16
|
|
17
17
|
## Usage
|
18
|
-
Usage should be very similar to the python library.
|
18
|
+
Usage should be very similar to the Python library. Here's a simple example:
|
19
19
|
|
20
20
|
Encode and decode text
|
21
21
|
```ruby
|
@@ -43,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
43
43
|
|
44
44
|
Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
|
45
45
|
|
46
|
+
To get started with development:
|
47
|
+
|
48
|
+
```sh
|
49
|
+
git clone https://github.com/IAPark/tiktoken_ruby.git
|
50
|
+
cd tiktoken_ruby
|
51
|
+
bundle install
|
52
|
+
bundle exec rake compile
|
53
|
+
bundle exec rake spec
|
54
|
+
```
|
55
|
+
|
56
|
+
|
46
57
|
## License
|
47
58
|
|
48
59
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
CHANGED
data/doctest_helper.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require "lib/tiktoken_ruby"
|
@@ -1,51 +1,52 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class Tiktoken::Encoding
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
4
|
+
attr_reader :name
|
5
|
+
|
6
|
+
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
7
|
+
# @param encoding [Symbol] The name of the encoding to load
|
8
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
9
|
+
def self.for_name(encoding)
|
10
|
+
Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
|
11
|
+
end
|
12
|
+
|
13
|
+
# This returns a Tiktoken::Encoding instance for the requested encoding
|
14
|
+
# It will reuse an existing encoding if it's already been loaded
|
15
|
+
# @param encoding [Symbol] The name of the encoding to load
|
16
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
17
|
+
def self.for_name_cached(encoding)
|
18
|
+
@encodings ||= {}
|
19
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
|
23
|
+
# basically it's unescaped
|
24
|
+
# @param text [String] The text to encode
|
25
|
+
# @return [Array<Integer>] The encoded tokens
|
26
|
+
def encode_ordinary(text)
|
27
|
+
@ext_base_bpe.encode_ordinary(text)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
|
31
|
+
# as text unless they're in the allowed_special array. It's basically like the text was escaped
|
32
|
+
# @param text [String] The text to encode
|
33
|
+
# @param allowed_special [Array<String>] An array of special tokens to allow
|
34
|
+
# @return [Array<Integer>] The encoded tokens
|
35
|
+
def encode(text, allowed_special: [])
|
36
|
+
@ext_base_bpe.encode(text, allowed_special)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Decodes the tokens back into text
|
40
|
+
# @param tokens [Array<Integer>] The tokens to decode
|
41
|
+
# @return [String] The decoded text
|
42
|
+
def decode(tokens)
|
43
|
+
@ext_base_bpe.decode(tokens)
|
44
|
+
end
|
45
|
+
|
46
|
+
private
|
47
|
+
|
48
|
+
def initialize(ext_base_bpe, name)
|
49
|
+
@ext_base_bpe = ext_base_bpe
|
50
|
+
@name = name
|
51
|
+
end
|
51
52
|
end
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "tiktoken_ruby/version"
|
4
|
-
require_relative "tiktoken_ruby/encoding
|
4
|
+
require_relative "tiktoken_ruby/encoding"
|
5
5
|
|
6
6
|
begin
|
7
7
|
RUBY_VERSION =~ /(\d+\.\d+)/
|
@@ -12,7 +12,6 @@ end
|
|
12
12
|
|
13
13
|
module Tiktoken
|
14
14
|
class << self
|
15
|
-
|
16
15
|
# Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
|
17
16
|
# it will reuse the instance of that type that was previously loaded
|
18
17
|
# @param name [Symbol|String] The name of the encoding to load
|
@@ -34,7 +33,7 @@ module Tiktoken
|
|
34
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
35
34
|
# enc.encode("hello world").length #=> 2
|
36
35
|
def encoding_for_model(model_name)
|
37
|
-
|
36
|
+
PREFIX_MODELS.each do |prefix|
|
38
37
|
if model_name.to_s.start_with?("#{prefix}-")
|
39
38
|
model_name = prefix
|
40
39
|
break
|
@@ -65,7 +64,7 @@ module Tiktoken
|
|
65
64
|
:r50k_base,
|
66
65
|
:p50k_base,
|
67
66
|
:p50k_edit,
|
68
|
-
:cl100k_base
|
67
|
+
:cl100k_base
|
69
68
|
]
|
70
69
|
|
71
70
|
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
|
@@ -80,10 +79,10 @@ module Tiktoken
|
|
80
79
|
"text-curie-001": "r50k_base",
|
81
80
|
"text-babbage-001": "r50k_base",
|
82
81
|
"text-ada-001": "r50k_base",
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
82
|
+
davinci: "r50k_base",
|
83
|
+
curie: "r50k_base",
|
84
|
+
babbage: "r50k_base",
|
85
|
+
ada: "r50k_base",
|
87
86
|
# code
|
88
87
|
"code-davinci-002": "p50k_base",
|
89
88
|
"code-davinci-001": "p50k_base",
|
@@ -106,7 +105,7 @@ module Tiktoken
|
|
106
105
|
"text-search-babbage-doc-001": "r50k_base",
|
107
106
|
"text-search-ada-doc-001": "r50k_base",
|
108
107
|
"code-search-babbage-code-001": "r50k_base",
|
109
|
-
"code-search-ada-code-001": "r50k_base"
|
108
|
+
"code-search-ada-code-001": "r50k_base"
|
110
109
|
}
|
111
110
|
|
112
111
|
# these are models that have versioned variants that are otherwise identical
|
data/tiktoken_ruby.gemspec
CHANGED
@@ -9,21 +9,21 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.email = ["isaac.a.park@gmail.com"]
|
10
10
|
|
11
11
|
spec.summary = "Ruby wrapper for Tiktoken"
|
12
|
-
spec.description = "An unofficial Ruby wrapper for Tiktoken, "
|
13
|
-
|
14
|
-
|
12
|
+
spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
|
13
|
+
"a BPE tokenizer written by and used by OpenAI. It can be used to " \
|
14
|
+
"count the number of tokens in text before sending it to OpenAI APIs."
|
15
15
|
|
16
16
|
spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
|
17
17
|
spec.license = "MIT"
|
18
|
-
spec.required_ruby_version = ">= 2.
|
18
|
+
spec.required_ruby_version = ">= 2.7.0"
|
19
19
|
spec.required_rubygems_version = ">= 3.1.0"
|
20
20
|
spec.platform = Gem::Platform::RUBY
|
21
21
|
|
22
22
|
spec.metadata["homepage_uri"] = spec.homepage
|
23
23
|
spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
|
24
24
|
spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
|
25
|
-
|
26
|
-
#spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
25
|
+
|
26
|
+
# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
27
27
|
|
28
28
|
# Specify which files should be added to the gem when it is released.
|
29
29
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
@@ -35,11 +35,7 @@ Gem::Specification.new do |spec|
|
|
35
35
|
spec.bindir = "exe"
|
36
36
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
37
37
|
spec.require_paths = ["lib"]
|
38
|
-
spec.extensions = ["ext/tiktoken_ruby/
|
39
|
-
|
40
|
-
# Uncomment to register a new dependency of your gem
|
41
|
-
# spec.add_dependency "example-gem", "~> 1.0"
|
42
|
-
# spec.add_dependency "rb_sys", "~> 0.9"
|
38
|
+
spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
|
43
39
|
|
44
40
|
# For more information and examples about making a new gem, check out our
|
45
41
|
# guide at: https://bundler.io/guides/creating_gem.html
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-03-
|
11
|
+
date: 2023-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
14
14
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
@@ -17,7 +17,7 @@ email:
|
|
17
17
|
- isaac.a.park@gmail.com
|
18
18
|
executables: []
|
19
19
|
extensions:
|
20
|
-
- ext/tiktoken_ruby/
|
20
|
+
- ext/tiktoken_ruby/extconf.rb
|
21
21
|
extra_rdoc_files: []
|
22
22
|
files:
|
23
23
|
- ".rspec"
|
@@ -54,7 +54,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
54
54
|
requirements:
|
55
55
|
- - ">="
|
56
56
|
- !ruby/object:Gem::Version
|
57
|
-
version: 2.
|
57
|
+
version: 2.7.0
|
58
58
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
60
|
- - ">="
|