tiktoken_ruby 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +22 -13
- data/Gemfile.lock +1 -1
- data/README.md +10 -3
- data/lib/tiktoken_ruby/encoding.rb +6 -2
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +7 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 347a10d045e27fca4cdfec03c4d2eac0150448b8f2125d5bcbcd1b92db83499a
|
4
|
+
data.tar.gz: de048e8320daa15b27ffa7ccdd9f7ec618cb2a3ad96fe2bedf71e6f780fc8b6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c587d4b18e777a0f7692aab857fec1fc3b3bcceceab158439197123786475c65d51f5356351bcf0c9c327b94e2ff15a83b2144c2db329aedea0456c1d763ff9
|
7
|
+
data.tar.gz: abfdeb836d81555effa5ef6647a77b360dae2b975bb3dc23a22c779f6dd82256630942ddfef01a837ff5195896fa4251925e87dec0d245dce1c8e83719a488f1
|
data/Cargo.lock
CHANGED
@@ -31,16 +31,16 @@ checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
|
|
31
31
|
|
32
32
|
[[package]]
|
33
33
|
name = "bindgen"
|
34
|
-
version = "0.
|
34
|
+
version = "0.69.4"
|
35
35
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
36
|
-
checksum = "
|
36
|
+
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
|
37
37
|
dependencies = [
|
38
38
|
"bitflags 2.4.0",
|
39
39
|
"cexpr",
|
40
40
|
"clang-sys",
|
41
|
+
"itertools",
|
41
42
|
"lazy_static",
|
42
43
|
"lazycell",
|
43
|
-
"peeking_take_while",
|
44
44
|
"proc-macro2",
|
45
45
|
"quote",
|
46
46
|
"regex",
|
@@ -114,6 +114,12 @@ dependencies = [
|
|
114
114
|
"libloading",
|
115
115
|
]
|
116
116
|
|
117
|
+
[[package]]
|
118
|
+
name = "either"
|
119
|
+
version = "1.10.0"
|
120
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
121
|
+
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
|
122
|
+
|
117
123
|
[[package]]
|
118
124
|
name = "fancy-regex"
|
119
125
|
version = "0.11.0"
|
@@ -130,6 +136,15 @@ version = "0.3.1"
|
|
130
136
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
131
137
|
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
132
138
|
|
139
|
+
[[package]]
|
140
|
+
name = "itertools"
|
141
|
+
version = "0.12.1"
|
142
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
143
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
144
|
+
dependencies = [
|
145
|
+
"either",
|
146
|
+
]
|
147
|
+
|
133
148
|
[[package]]
|
134
149
|
name = "lazy_static"
|
135
150
|
version = "1.4.0"
|
@@ -242,12 +257,6 @@ dependencies = [
|
|
242
257
|
"windows-sys",
|
243
258
|
]
|
244
259
|
|
245
|
-
[[package]]
|
246
|
-
name = "peeking_take_while"
|
247
|
-
version = "0.1.2"
|
248
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
249
|
-
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
250
|
-
|
251
260
|
[[package]]
|
252
261
|
name = "proc-macro2"
|
253
262
|
version = "1.0.66"
|
@@ -268,18 +277,18 @@ dependencies = [
|
|
268
277
|
|
269
278
|
[[package]]
|
270
279
|
name = "rb-sys"
|
271
|
-
version = "0.9.
|
280
|
+
version = "0.9.87"
|
272
281
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
273
|
-
checksum = "
|
282
|
+
checksum = "225103e3d69bbfe8831f9fd0d2461335f3a9dd06aa6e88bcb6d6970383494d06"
|
274
283
|
dependencies = [
|
275
284
|
"rb-sys-build",
|
276
285
|
]
|
277
286
|
|
278
287
|
[[package]]
|
279
288
|
name = "rb-sys-build"
|
280
|
-
version = "0.9.
|
289
|
+
version = "0.9.87"
|
281
290
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
282
|
-
checksum = "
|
291
|
+
checksum = "bacce8095a5167d5ede618bbd9353e9d9e2f32ddaf54be911106f0ee6baacf09"
|
283
292
|
dependencies = [
|
284
293
|
"bindgen",
|
285
294
|
"lazy_static",
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,14 @@
|
|
1
1
|
[![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
|
2
|
+
|
2
3
|
# tiktoken_ruby
|
3
4
|
|
4
5
|
[Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
|
5
|
-
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
6
|
+
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
7
|
+
|
8
|
+
## Request for maintainers
|
9
|
+
|
10
|
+
I can't really put substantial time into maintaining this. Probably nothing more than a couple of hours every few months. If you have experience maintaining Ruby gems and would like to
|
11
|
+
lend a hand, please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26).
|
6
12
|
|
7
13
|
## Installation
|
8
14
|
|
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
15
21
|
$ gem install tiktoken_ruby
|
16
22
|
|
17
23
|
## Usage
|
24
|
+
|
18
25
|
Usage should be very similar to the Python library. Here's a simple example:
|
19
26
|
|
20
27
|
Encode and decode text
|
28
|
+
|
21
29
|
```ruby
|
22
30
|
require 'tiktoken_ruby'
|
23
|
-
|
24
31
|
enc = Tiktoken.get_encoding("cl100k_base")
|
25
32
|
enc.decode(enc.encode("hello world")) #=> "hello world"
|
26
33
|
```
|
27
34
|
|
28
35
|
Encoders can also be retrieved by model name
|
36
|
+
|
29
37
|
```ruby
|
30
38
|
require 'tiktoken_ruby'
|
31
39
|
|
@@ -53,7 +61,6 @@ bundle exec rake compile
|
|
53
61
|
bundle exec rake spec
|
54
62
|
```
|
55
63
|
|
56
|
-
|
57
64
|
## License
|
58
65
|
|
59
66
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
@@ -1,6 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class Tiktoken::Encoding
|
4
|
+
CACHE_MUTEX = Mutex.new
|
5
|
+
|
4
6
|
attr_reader :name
|
5
7
|
|
6
8
|
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
|
|
15
17
|
# @param encoding [Symbol] The name of the encoding to load
|
16
18
|
# @return [Tiktoken::Encoding] The encoding instance
|
17
19
|
def self.for_name_cached(encoding)
|
18
|
-
|
19
|
-
|
20
|
+
CACHE_MUTEX.synchronize do
|
21
|
+
@encodings ||= {}
|
22
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
23
|
+
end
|
20
24
|
end
|
21
25
|
|
22
26
|
# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -28,7 +28,7 @@ module Tiktoken
|
|
28
28
|
|
29
29
|
# Gets the encoding for an OpenAI model
|
30
30
|
# @param model_name [Symbol|String] The name of the model to get the encoding for
|
31
|
-
# @return [Tiktoken::Encoding] The encoding instance
|
31
|
+
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
|
32
32
|
# @example Count tokens for text
|
33
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
34
34
|
# enc.encode("hello world").length #=> 2
|
@@ -37,10 +37,12 @@ module Tiktoken
|
|
37
37
|
return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
|
38
38
|
end
|
39
39
|
|
40
|
-
MODEL_PREFIX_TO_ENCODING.
|
41
|
-
|
42
|
-
|
43
|
-
|
40
|
+
_prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
|
41
|
+
model_name.start_with?(prefix.to_s)
|
42
|
+
end
|
43
|
+
|
44
|
+
if encoding
|
45
|
+
get_encoding(encoding)
|
44
46
|
end
|
45
47
|
end
|
46
48
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|