tiktoken_ruby 0.0.3-x64-mingw-ucrt → 0.0.4-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/Gemfile.lock +2 -7
- data/README.md +12 -1
- data/Rakefile +0 -1
- data/doctest_helper.rb +1 -1
- data/lib/tiktoken_ruby/3.1/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/3.2/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/encoding.rb +48 -47
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +8 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1d3ae645bd74a9706f834cfddab56cfba8233233243f0f9f7a6e38f453ea0d7f
|
4
|
+
data.tar.gz: c9aba2e4ffecf42dda1ec11b40e22c929dc471cf0fdffcbaa243e477b7aa18dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9163832bedcebe8d97b2319862bc7230ea78441b0883655392671e9269360dd74a65234b79a2277876a786e2473b0be3feadea6b865283217ed2a75b019de2f2
|
7
|
+
data.tar.gz: ece1e1a541aa4b63818096b837ae1ae2e0f4fef91e1ff6baebfd2471f10fb746d6830d47c170d7fd524adede6d7411abe78bfa650ef4e27997c5064ef871373a
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,24 +1,19 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tiktoken_ruby (0.0.3)
|
4
|
+
tiktoken_ruby (0.0.4)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.2)
|
10
|
-
coderay (1.1.3)
|
11
10
|
diff-lcs (1.5.0)
|
12
11
|
json (2.6.3)
|
13
12
|
language_server-protocol (3.17.0.3)
|
14
|
-
method_source (1.0.0)
|
15
13
|
minitest (5.18.0)
|
16
14
|
parallel (1.22.1)
|
17
15
|
parser (3.2.1.1)
|
18
16
|
ast (~> 2.4.1)
|
19
|
-
pry (0.14.2)
|
20
|
-
coderay (~> 1.1)
|
21
|
-
method_source (~> 1.0)
|
22
17
|
rainbow (3.1.1)
|
23
18
|
rake (13.0.6)
|
24
19
|
rake-compiler (1.2.1)
|
@@ -69,10 +64,10 @@ GEM
|
|
69
64
|
|
70
65
|
PLATFORMS
|
71
66
|
arm64-darwin-22
|
67
|
+
x86_64-darwin-22
|
72
68
|
x86_64-linux
|
73
69
|
|
74
70
|
DEPENDENCIES
|
75
|
-
pry (~> 0.14.2)
|
76
71
|
rake (~> 13.0)
|
77
72
|
rake-compiler
|
78
73
|
rb_sys
|
data/README.md
CHANGED
@@ -15,7 +15,7 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
15
15
|
$ gem install tiktoken_ruby
|
16
16
|
|
17
17
|
## Usage
|
18
|
-
Usage should be very similar to the python library.
|
18
|
+
Usage should be very similar to the python library. Here's a simple example
|
19
19
|
|
20
20
|
Encode and decode text
|
21
21
|
```ruby
|
@@ -43,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
43
43
|
|
44
44
|
Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
|
45
45
|
|
46
|
+
To get started with development:
|
47
|
+
|
48
|
+
```sh
|
49
|
+
git clone https://github.com/IAPark/tiktoken_ruby.git
|
50
|
+
cd tiktoken_ruby
|
51
|
+
bundle install
|
52
|
+
bundle exec rake compile
|
53
|
+
bundle exec rake spec
|
54
|
+
```
|
55
|
+
|
56
|
+
|
46
57
|
## License
|
47
58
|
|
48
59
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
CHANGED
data/doctest_helper.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require "lib/tiktoken_ruby"
|
Binary file
|
Binary file
|
@@ -1,51 +1,52 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class Tiktoken::Encoding
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
4
|
+
attr_reader :name
|
5
|
+
|
6
|
+
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
7
|
+
# @param encoding [Symbol] The name of the encoding to load
|
8
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
9
|
+
def self.for_name(encoding)
|
10
|
+
Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
|
11
|
+
end
|
12
|
+
|
13
|
+
# This returns a Tiktoken::Encoding instance for the requested encoding
|
14
|
+
# It will reuse an existing encoding if it's already been loaded
|
15
|
+
# @param encoding [Symbol] The name of the encoding to load
|
16
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
17
|
+
def self.for_name_cached(encoding)
|
18
|
+
@encodings ||= {}
|
19
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
|
23
|
+
# basically it's unescaped
|
24
|
+
# @param text [String] The text to encode
|
25
|
+
# @return [Array<Integer>] The encoded tokens
|
26
|
+
def encode_ordinary(text)
|
27
|
+
@ext_base_bpe.encode_ordinary(text)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
|
31
|
+
# as text unless they're in the allowed_special array. It's basically like the text was escaped
|
32
|
+
# @param text [String] The text to encode
|
33
|
+
# @param allowed_special [Array<String>] An array of special tokens to allow
|
34
|
+
# @return [Array<Integer>] The encoded tokens
|
35
|
+
def encode(text, allowed_special: [])
|
36
|
+
@ext_base_bpe.encode(text, allowed_special)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Decodes the tokens back into text
|
40
|
+
# @param tokens [Array<Integer>] The tokens to decode
|
41
|
+
# @return [String] The decoded text
|
42
|
+
def decode(tokens)
|
43
|
+
@ext_base_bpe.decode(tokens)
|
44
|
+
end
|
45
|
+
|
46
|
+
private
|
47
|
+
|
48
|
+
def initialize(ext_base_bpe, name)
|
49
|
+
@ext_base_bpe = ext_base_bpe
|
50
|
+
@name = name
|
51
|
+
end
|
51
52
|
end
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "tiktoken_ruby/version"
|
4
|
-
require_relative "tiktoken_ruby/encoding
|
4
|
+
require_relative "tiktoken_ruby/encoding"
|
5
5
|
|
6
6
|
begin
|
7
7
|
RUBY_VERSION =~ /(\d+\.\d+)/
|
@@ -12,7 +12,6 @@ end
|
|
12
12
|
|
13
13
|
module Tiktoken
|
14
14
|
class << self
|
15
|
-
|
16
15
|
# Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
|
17
16
|
# it will reuse the instance of that type that was previously loaded
|
18
17
|
# @param name [Symbol|String] The name of the encoding to load
|
@@ -34,7 +33,7 @@ module Tiktoken
|
|
34
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
35
34
|
# enc.encode("hello world").length #=> 2
|
36
35
|
def encoding_for_model(model_name)
|
37
|
-
|
36
|
+
PREFIX_MODELS.each do |prefix|
|
38
37
|
if model_name.to_s.start_with?("#{prefix}-")
|
39
38
|
model_name = prefix
|
40
39
|
break
|
@@ -65,7 +64,7 @@ module Tiktoken
|
|
65
64
|
:r50k_base,
|
66
65
|
:p50k_base,
|
67
66
|
:p50k_edit,
|
68
|
-
:cl100k_base
|
67
|
+
:cl100k_base
|
69
68
|
]
|
70
69
|
|
71
70
|
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
|
@@ -80,10 +79,10 @@ module Tiktoken
|
|
80
79
|
"text-curie-001": "r50k_base",
|
81
80
|
"text-babbage-001": "r50k_base",
|
82
81
|
"text-ada-001": "r50k_base",
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
82
|
+
davinci: "r50k_base",
|
83
|
+
curie: "r50k_base",
|
84
|
+
babbage: "r50k_base",
|
85
|
+
ada: "r50k_base",
|
87
86
|
# code
|
88
87
|
"code-davinci-002": "p50k_base",
|
89
88
|
"code-davinci-001": "p50k_base",
|
@@ -106,7 +105,7 @@ module Tiktoken
|
|
106
105
|
"text-search-babbage-doc-001": "r50k_base",
|
107
106
|
"text-search-ada-doc-001": "r50k_base",
|
108
107
|
"code-search-babbage-code-001": "r50k_base",
|
109
|
-
"code-search-ada-code-001": "r50k_base"
|
108
|
+
"code-search-ada-code-001": "r50k_base"
|
110
109
|
}
|
111
110
|
|
112
111
|
# these are models that have versioned variants that are otherwise identical
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: x64-mingw-ucrt
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-03-
|
11
|
+
date: 2023-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
14
14
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|