tiktoken_ruby 0.0.5 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +49 -25
- data/Gemfile +5 -9
- data/Gemfile.lock +44 -33
- data/README.md +10 -3
- data/ext/tiktoken_ruby/Cargo.toml +2 -1
- data/ext/tiktoken_ruby/src/lib.rs +9 -5
- data/lib/tiktoken_ruby/encoding.rb +6 -2
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +38 -19
- data/tiktoken_ruby.gemspec +1 -12
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 347a10d045e27fca4cdfec03c4d2eac0150448b8f2125d5bcbcd1b92db83499a
|
|
4
|
+
data.tar.gz: de048e8320daa15b27ffa7ccdd9f7ec618cb2a3ad96fe2bedf71e6f780fc8b6f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7c587d4b18e777a0f7692aab857fec1fc3b3bcceceab158439197123786475c65d51f5356351bcf0c9c327b94e2ff15a83b2144c2db329aedea0456c1d763ff9
|
|
7
|
+
data.tar.gz: abfdeb836d81555effa5ef6647a77b360dae2b975bb3dc23a22c779f6dd82256630942ddfef01a837ff5195896fa4251925e87dec0d245dce1c8e83719a488f1
|
data/Cargo.lock
CHANGED
|
@@ -31,21 +31,22 @@ checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
|
|
|
31
31
|
|
|
32
32
|
[[package]]
|
|
33
33
|
name = "bindgen"
|
|
34
|
-
version = "0.
|
|
34
|
+
version = "0.69.4"
|
|
35
35
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
36
|
-
checksum = "
|
|
36
|
+
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
|
|
37
37
|
dependencies = [
|
|
38
|
-
"bitflags",
|
|
38
|
+
"bitflags 2.4.0",
|
|
39
39
|
"cexpr",
|
|
40
40
|
"clang-sys",
|
|
41
|
+
"itertools",
|
|
41
42
|
"lazy_static",
|
|
42
43
|
"lazycell",
|
|
43
|
-
"peeking_take_while",
|
|
44
44
|
"proc-macro2",
|
|
45
45
|
"quote",
|
|
46
46
|
"regex",
|
|
47
47
|
"rustc-hash",
|
|
48
48
|
"shlex",
|
|
49
|
+
"syn",
|
|
49
50
|
]
|
|
50
51
|
|
|
51
52
|
[[package]]
|
|
@@ -69,6 +70,12 @@ version = "1.3.2"
|
|
|
69
70
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
70
71
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
71
72
|
|
|
73
|
+
[[package]]
|
|
74
|
+
name = "bitflags"
|
|
75
|
+
version = "2.4.0"
|
|
76
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
77
|
+
checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
|
|
78
|
+
|
|
72
79
|
[[package]]
|
|
73
80
|
name = "bstr"
|
|
74
81
|
version = "1.4.0"
|
|
@@ -107,6 +114,12 @@ dependencies = [
|
|
|
107
114
|
"libloading",
|
|
108
115
|
]
|
|
109
116
|
|
|
117
|
+
[[package]]
|
|
118
|
+
name = "either"
|
|
119
|
+
version = "1.10.0"
|
|
120
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
121
|
+
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
|
|
122
|
+
|
|
110
123
|
[[package]]
|
|
111
124
|
name = "fancy-regex"
|
|
112
125
|
version = "0.11.0"
|
|
@@ -123,6 +136,15 @@ version = "0.3.1"
|
|
|
123
136
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
124
137
|
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
|
125
138
|
|
|
139
|
+
[[package]]
|
|
140
|
+
name = "itertools"
|
|
141
|
+
version = "0.12.1"
|
|
142
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
143
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
|
144
|
+
dependencies = [
|
|
145
|
+
"either",
|
|
146
|
+
]
|
|
147
|
+
|
|
126
148
|
[[package]]
|
|
127
149
|
name = "lazy_static"
|
|
128
150
|
version = "1.4.0"
|
|
@@ -163,20 +185,21 @@ dependencies = [
|
|
|
163
185
|
|
|
164
186
|
[[package]]
|
|
165
187
|
name = "magnus"
|
|
166
|
-
version = "0.
|
|
188
|
+
version = "0.6.1"
|
|
167
189
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
168
|
-
checksum = "
|
|
190
|
+
checksum = "0516897a45f8ce8270a8910bcb94cd83538b19b6ae3a0c281a765df170b64695"
|
|
169
191
|
dependencies = [
|
|
170
192
|
"magnus-macros",
|
|
171
193
|
"rb-sys",
|
|
172
194
|
"rb-sys-env",
|
|
195
|
+
"seq-macro",
|
|
173
196
|
]
|
|
174
197
|
|
|
175
198
|
[[package]]
|
|
176
199
|
name = "magnus-macros"
|
|
177
|
-
version = "0.
|
|
200
|
+
version = "0.6.0"
|
|
178
201
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
179
|
-
checksum = "
|
|
202
|
+
checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
|
180
203
|
dependencies = [
|
|
181
204
|
"proc-macro2",
|
|
182
205
|
"quote",
|
|
@@ -234,44 +257,38 @@ dependencies = [
|
|
|
234
257
|
"windows-sys",
|
|
235
258
|
]
|
|
236
259
|
|
|
237
|
-
[[package]]
|
|
238
|
-
name = "peeking_take_while"
|
|
239
|
-
version = "0.1.2"
|
|
240
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
241
|
-
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
|
242
|
-
|
|
243
260
|
[[package]]
|
|
244
261
|
name = "proc-macro2"
|
|
245
|
-
version = "1.0.
|
|
262
|
+
version = "1.0.66"
|
|
246
263
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
247
|
-
checksum = "
|
|
264
|
+
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
|
248
265
|
dependencies = [
|
|
249
266
|
"unicode-ident",
|
|
250
267
|
]
|
|
251
268
|
|
|
252
269
|
[[package]]
|
|
253
270
|
name = "quote"
|
|
254
|
-
version = "1.0.
|
|
271
|
+
version = "1.0.33"
|
|
255
272
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
256
|
-
checksum = "
|
|
273
|
+
checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
|
|
257
274
|
dependencies = [
|
|
258
275
|
"proc-macro2",
|
|
259
276
|
]
|
|
260
277
|
|
|
261
278
|
[[package]]
|
|
262
279
|
name = "rb-sys"
|
|
263
|
-
version = "0.9.
|
|
280
|
+
version = "0.9.87"
|
|
264
281
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
265
|
-
checksum = "
|
|
282
|
+
checksum = "225103e3d69bbfe8831f9fd0d2461335f3a9dd06aa6e88bcb6d6970383494d06"
|
|
266
283
|
dependencies = [
|
|
267
284
|
"rb-sys-build",
|
|
268
285
|
]
|
|
269
286
|
|
|
270
287
|
[[package]]
|
|
271
288
|
name = "rb-sys-build"
|
|
272
|
-
version = "0.9.
|
|
289
|
+
version = "0.9.87"
|
|
273
290
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
274
|
-
checksum = "
|
|
291
|
+
checksum = "bacce8095a5167d5ede618bbd9353e9d9e2f32ddaf54be911106f0ee6baacf09"
|
|
275
292
|
dependencies = [
|
|
276
293
|
"bindgen",
|
|
277
294
|
"lazy_static",
|
|
@@ -294,7 +311,7 @@ version = "0.2.16"
|
|
|
294
311
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
295
312
|
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
|
296
313
|
dependencies = [
|
|
297
|
-
"bitflags",
|
|
314
|
+
"bitflags 1.3.2",
|
|
298
315
|
]
|
|
299
316
|
|
|
300
317
|
[[package]]
|
|
@@ -332,6 +349,12 @@ version = "1.1.0"
|
|
|
332
349
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
333
350
|
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
|
334
351
|
|
|
352
|
+
[[package]]
|
|
353
|
+
name = "seq-macro"
|
|
354
|
+
version = "0.3.5"
|
|
355
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
356
|
+
checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
357
|
+
|
|
335
358
|
[[package]]
|
|
336
359
|
name = "serde"
|
|
337
360
|
version = "1.0.157"
|
|
@@ -358,9 +381,9 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
|
|
358
381
|
|
|
359
382
|
[[package]]
|
|
360
383
|
name = "syn"
|
|
361
|
-
version = "
|
|
384
|
+
version = "2.0.31"
|
|
362
385
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
363
|
-
checksum = "
|
|
386
|
+
checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398"
|
|
364
387
|
dependencies = [
|
|
365
388
|
"proc-macro2",
|
|
366
389
|
"quote",
|
|
@@ -386,6 +409,7 @@ name = "tiktoken_ruby"
|
|
|
386
409
|
version = "0.1.0"
|
|
387
410
|
dependencies = [
|
|
388
411
|
"magnus",
|
|
412
|
+
"rb-sys",
|
|
389
413
|
"tiktoken-rs",
|
|
390
414
|
]
|
|
391
415
|
|
data/Gemfile
CHANGED
|
@@ -2,15 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
source "https://rubygems.org"
|
|
4
4
|
|
|
5
|
-
# Specify your gem's dependencies in tiktoken_ruby.gemspec
|
|
6
5
|
gemspec
|
|
7
6
|
|
|
8
|
-
gem "rake"
|
|
9
|
-
|
|
7
|
+
gem "rake"
|
|
10
8
|
gem "rake-compiler"
|
|
11
|
-
|
|
12
|
-
gem "
|
|
13
|
-
|
|
14
|
-
gem "
|
|
15
|
-
|
|
16
|
-
gem "yard-doctest", "~> 0.1.17"
|
|
9
|
+
gem "rspec"
|
|
10
|
+
gem "standard"
|
|
11
|
+
gem "yard-doctest"
|
|
12
|
+
gem "racc"
|
data/Gemfile.lock
CHANGED
|
@@ -1,64 +1,74 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
tiktoken_ruby (0.0.
|
|
5
|
-
rb_sys (
|
|
4
|
+
tiktoken_ruby (0.0.8)
|
|
5
|
+
rb_sys (>= 0.9.86)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
10
|
ast (2.4.2)
|
|
11
11
|
diff-lcs (1.5.0)
|
|
12
|
-
json (2.
|
|
12
|
+
json (2.7.1)
|
|
13
13
|
language_server-protocol (3.17.0.3)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
14
|
+
lint_roller (1.1.0)
|
|
15
|
+
minitest (5.21.2)
|
|
16
|
+
parallel (1.24.0)
|
|
17
|
+
parser (3.3.0.4)
|
|
17
18
|
ast (~> 2.4.1)
|
|
19
|
+
racc
|
|
20
|
+
racc (1.7.3)
|
|
18
21
|
rainbow (3.1.1)
|
|
19
|
-
rake (13.0
|
|
20
|
-
rake-compiler (1.2.
|
|
22
|
+
rake (13.1.0)
|
|
23
|
+
rake-compiler (1.2.5)
|
|
21
24
|
rake
|
|
22
|
-
rb_sys (0.9.
|
|
23
|
-
regexp_parser (2.
|
|
24
|
-
rexml (3.2.
|
|
25
|
+
rb_sys (0.9.86)
|
|
26
|
+
regexp_parser (2.9.0)
|
|
27
|
+
rexml (3.2.6)
|
|
25
28
|
rspec (3.12.0)
|
|
26
29
|
rspec-core (~> 3.12.0)
|
|
27
30
|
rspec-expectations (~> 3.12.0)
|
|
28
31
|
rspec-mocks (~> 3.12.0)
|
|
29
|
-
rspec-core (3.12.
|
|
32
|
+
rspec-core (3.12.2)
|
|
30
33
|
rspec-support (~> 3.12.0)
|
|
31
|
-
rspec-expectations (3.12.
|
|
34
|
+
rspec-expectations (3.12.3)
|
|
32
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
33
36
|
rspec-support (~> 3.12.0)
|
|
34
|
-
rspec-mocks (3.12.
|
|
37
|
+
rspec-mocks (3.12.6)
|
|
35
38
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
36
39
|
rspec-support (~> 3.12.0)
|
|
37
|
-
rspec-support (3.12.
|
|
38
|
-
rubocop (1.
|
|
40
|
+
rspec-support (3.12.1)
|
|
41
|
+
rubocop (1.59.0)
|
|
39
42
|
json (~> 2.3)
|
|
43
|
+
language_server-protocol (>= 3.17.0)
|
|
40
44
|
parallel (~> 1.10)
|
|
41
|
-
parser (>= 3.2.
|
|
45
|
+
parser (>= 3.2.2.4)
|
|
42
46
|
rainbow (>= 2.2.2, < 4.0)
|
|
43
47
|
regexp_parser (>= 1.8, < 3.0)
|
|
44
48
|
rexml (>= 3.2.5, < 4.0)
|
|
45
|
-
rubocop-ast (>= 1.
|
|
49
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
|
46
50
|
ruby-progressbar (~> 1.7)
|
|
47
51
|
unicode-display_width (>= 2.4.0, < 3.0)
|
|
48
|
-
rubocop-ast (1.
|
|
52
|
+
rubocop-ast (1.30.0)
|
|
49
53
|
parser (>= 3.2.1.0)
|
|
50
|
-
rubocop-performance (1.
|
|
51
|
-
rubocop (>= 1.
|
|
52
|
-
rubocop-ast (>=
|
|
54
|
+
rubocop-performance (1.20.2)
|
|
55
|
+
rubocop (>= 1.48.1, < 2.0)
|
|
56
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
|
53
57
|
ruby-progressbar (1.13.0)
|
|
54
|
-
standard (1.
|
|
58
|
+
standard (1.33.0)
|
|
55
59
|
language_server-protocol (~> 3.17.0.2)
|
|
56
|
-
|
|
57
|
-
rubocop
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
lint_roller (~> 1.0)
|
|
61
|
+
rubocop (~> 1.59.0)
|
|
62
|
+
standard-custom (~> 1.0.0)
|
|
63
|
+
standard-performance (~> 1.3)
|
|
64
|
+
standard-custom (1.0.2)
|
|
65
|
+
lint_roller (~> 1.0)
|
|
66
|
+
rubocop (~> 1.50)
|
|
67
|
+
standard-performance (1.3.1)
|
|
68
|
+
lint_roller (~> 1.1)
|
|
69
|
+
rubocop-performance (~> 1.20.2)
|
|
70
|
+
unicode-display_width (2.5.0)
|
|
71
|
+
yard (0.9.34)
|
|
62
72
|
yard-doctest (0.1.17)
|
|
63
73
|
minitest
|
|
64
74
|
yard
|
|
@@ -70,12 +80,13 @@ PLATFORMS
|
|
|
70
80
|
x86_64-linux
|
|
71
81
|
|
|
72
82
|
DEPENDENCIES
|
|
73
|
-
|
|
83
|
+
racc
|
|
84
|
+
rake
|
|
74
85
|
rake-compiler
|
|
75
|
-
rspec
|
|
76
|
-
standard
|
|
86
|
+
rspec
|
|
87
|
+
standard
|
|
77
88
|
tiktoken_ruby!
|
|
78
|
-
yard-doctest
|
|
89
|
+
yard-doctest
|
|
79
90
|
|
|
80
91
|
BUNDLED WITH
|
|
81
92
|
2.4.6
|
data/README.md
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
[](https://badge.fury.io/rb/tiktoken_ruby)
|
|
2
|
+
|
|
2
3
|
# tiktoken_ruby
|
|
3
4
|
|
|
4
5
|
[Tiktoken](https://github.com/openai/tiktoken) is BPE tokenizer from OpenAI used with their GPT models.
|
|
5
|
-
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
|
6
|
+
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
|
7
|
+
|
|
8
|
+
## Request for maintainers
|
|
9
|
+
|
|
10
|
+
I can't really put substantial time into maintaining this. Probably nothing more than a couple hours every few months. If you have experience maintaining ruby gems and would like to
|
|
11
|
+
lend a hand please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26)
|
|
6
12
|
|
|
7
13
|
## Installation
|
|
8
14
|
|
|
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
|
15
21
|
$ gem install tiktoken_ruby
|
|
16
22
|
|
|
17
23
|
## Usage
|
|
24
|
+
|
|
18
25
|
Usage should be very similar to the python library. Here's a simple example
|
|
19
26
|
|
|
20
27
|
Encode and decode text
|
|
28
|
+
|
|
21
29
|
```ruby
|
|
22
30
|
require 'tiktoken_ruby'
|
|
23
|
-
|
|
24
31
|
enc = Tiktoken.get_encoding("cl100k_base")
|
|
25
32
|
enc.decode(enc.encode("hello world")) #=> "hello world"
|
|
26
33
|
```
|
|
27
34
|
|
|
28
35
|
Encoders can also be retrieved by model name
|
|
36
|
+
|
|
29
37
|
```ruby
|
|
30
38
|
require 'tiktoken_ruby'
|
|
31
39
|
|
|
@@ -53,7 +61,6 @@ bundle exec rake compile
|
|
|
53
61
|
bundle exec rake spec
|
|
54
62
|
```
|
|
55
63
|
|
|
56
|
-
|
|
57
64
|
## License
|
|
58
65
|
|
|
59
66
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
@@ -10,5 +10,6 @@ publish = false
|
|
|
10
10
|
crate-type = ["cdylib"]
|
|
11
11
|
|
|
12
12
|
[dependencies]
|
|
13
|
-
magnus = { version = "0.
|
|
13
|
+
magnus = { version = "0.6.1" }
|
|
14
|
+
rb-sys = { version = "*", features = ["stable-api-compiled-fallback"] }
|
|
14
15
|
tiktoken-rs = { git = "https://github.com/IAPark/tiktoken-rs.git" }
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
mod core_bpe_wrapper;
|
|
2
2
|
|
|
3
3
|
use core_bpe_wrapper::CoreBPEWrapper;
|
|
4
|
-
use magnus::{define_module, function, prelude::*, Error,
|
|
4
|
+
use magnus::{class, define_module, function, method, prelude::*, Error, ExceptionClass, RModule};
|
|
5
5
|
|
|
6
6
|
fn r50k_base() -> CoreBPEWrapper {
|
|
7
7
|
let core_bpe = tiktoken_rs::r50k_base().unwrap();
|
|
@@ -38,14 +38,18 @@ fn init() -> Result<(), Error> {
|
|
|
38
38
|
factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
|
|
39
39
|
factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
|
|
40
40
|
|
|
41
|
-
|
|
42
41
|
let ext_module = module.define_module("Ext")?;
|
|
43
42
|
let bpe_class = ext_module.define_class("CoreBPE", class::object())?;
|
|
44
43
|
|
|
45
|
-
bpe_class.define_method(
|
|
44
|
+
bpe_class.define_method(
|
|
45
|
+
"encode_ordinary",
|
|
46
|
+
method!(CoreBPEWrapper::encode_ordinary, 1),
|
|
47
|
+
)?;
|
|
46
48
|
bpe_class.define_method("encode", method!(CoreBPEWrapper::encode, 2))?;
|
|
47
|
-
bpe_class.define_method(
|
|
48
|
-
|
|
49
|
+
bpe_class.define_method(
|
|
50
|
+
"encode_with_special_tokens",
|
|
51
|
+
method!(CoreBPEWrapper::encode_with_special_tokens, 1),
|
|
52
|
+
)?;
|
|
49
53
|
|
|
50
54
|
bpe_class.define_method("decode", method!(CoreBPEWrapper::decode, 1))?;
|
|
51
55
|
Ok(())
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
class Tiktoken::Encoding
|
|
4
|
+
CACHE_MUTEX = Mutex.new
|
|
5
|
+
|
|
4
6
|
attr_reader :name
|
|
5
7
|
|
|
6
8
|
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
|
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
|
|
|
15
17
|
# @param encoding [Symbol] The name of the encoding to load
|
|
16
18
|
# @return [Tiktoken::Encoding] The encoding instance
|
|
17
19
|
def self.for_name_cached(encoding)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
+
CACHE_MUTEX.synchronize do
|
|
21
|
+
@encodings ||= {}
|
|
22
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
|
23
|
+
end
|
|
20
24
|
end
|
|
21
25
|
|
|
22
26
|
# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
|
data/lib/tiktoken_ruby.rb
CHANGED
|
@@ -28,22 +28,22 @@ module Tiktoken
|
|
|
28
28
|
|
|
29
29
|
# Gets the encoding for an OpenAI model
|
|
30
30
|
# @param model_name [Symbol|String] The name of the model to get the encoding for
|
|
31
|
-
# @return [Tiktoken::Encoding] The encoding instance
|
|
31
|
+
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
|
|
32
32
|
# @example Count tokens for text
|
|
33
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
|
34
34
|
# enc.encode("hello world").length #=> 2
|
|
35
35
|
def encoding_for_model(model_name)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
model_name = prefix
|
|
39
|
-
break
|
|
40
|
-
end
|
|
36
|
+
if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
|
|
37
|
+
return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
|
|
41
38
|
end
|
|
42
39
|
|
|
43
|
-
|
|
44
|
-
|
|
40
|
+
_prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
|
|
41
|
+
model_name.start_with?(prefix.to_s)
|
|
42
|
+
end
|
|
45
43
|
|
|
46
|
-
|
|
44
|
+
if encoding
|
|
45
|
+
get_encoding(encoding)
|
|
46
|
+
end
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
# Lists all the encodings that are supported
|
|
@@ -67,12 +67,22 @@ module Tiktoken
|
|
|
67
67
|
:cl100k_base
|
|
68
68
|
]
|
|
69
69
|
|
|
70
|
-
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
|
70
|
+
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
|
71
71
|
# that is also MIT licensed but by OpenAI
|
|
72
72
|
MODEL_TO_ENCODING_NAME = {
|
|
73
|
+
# chat
|
|
73
74
|
"gpt-4": "cl100k_base",
|
|
74
75
|
"gpt-3.5-turbo": "cl100k_base",
|
|
75
|
-
#
|
|
76
|
+
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
|
77
|
+
# base
|
|
78
|
+
"davinci-002": "cl100k_base",
|
|
79
|
+
"babbage-002": "cl100k_base",
|
|
80
|
+
# embeddings
|
|
81
|
+
"text-embedding-ada-002": "cl100k_base",
|
|
82
|
+
"text-embedding-3-small": "cl100k_base",
|
|
83
|
+
"text-embedding-3-large": "cl100k_base",
|
|
84
|
+
# DEPRECATED MODELS
|
|
85
|
+
# text (DEPRECATED)
|
|
76
86
|
"text-davinci-003": "p50k_base",
|
|
77
87
|
"text-davinci-002": "p50k_base",
|
|
78
88
|
"text-davinci-001": "r50k_base",
|
|
@@ -83,19 +93,17 @@ module Tiktoken
|
|
|
83
93
|
curie: "r50k_base",
|
|
84
94
|
babbage: "r50k_base",
|
|
85
95
|
ada: "r50k_base",
|
|
86
|
-
# code
|
|
96
|
+
# code (DEPRECATED)
|
|
87
97
|
"code-davinci-002": "p50k_base",
|
|
88
98
|
"code-davinci-001": "p50k_base",
|
|
89
99
|
"code-cushman-002": "p50k_base",
|
|
90
100
|
"code-cushman-001": "p50k_base",
|
|
91
101
|
"davinci-codex": "p50k_base",
|
|
92
102
|
"cushman-codex": "p50k_base",
|
|
93
|
-
# edit
|
|
103
|
+
# edit (DEPRECATED)
|
|
94
104
|
"text-davinci-edit-001": "p50k_edit",
|
|
95
105
|
"code-davinci-edit-001": "p50k_edit",
|
|
96
|
-
# embeddings
|
|
97
|
-
"text-embedding-ada-002": "cl100k_base",
|
|
98
|
-
# old embeddings
|
|
106
|
+
# old embeddings (DEPRECATED)
|
|
99
107
|
"text-similarity-davinci-001": "r50k_base",
|
|
100
108
|
"text-similarity-curie-001": "r50k_base",
|
|
101
109
|
"text-similarity-babbage-001": "r50k_base",
|
|
@@ -105,10 +113,21 @@ module Tiktoken
|
|
|
105
113
|
"text-search-babbage-doc-001": "r50k_base",
|
|
106
114
|
"text-search-ada-doc-001": "r50k_base",
|
|
107
115
|
"code-search-babbage-code-001": "r50k_base",
|
|
108
|
-
"code-search-ada-code-001": "r50k_base"
|
|
116
|
+
"code-search-ada-code-001": "r50k_base",
|
|
117
|
+
# open source
|
|
118
|
+
gpt2: "gpt2"
|
|
109
119
|
}
|
|
110
120
|
|
|
111
|
-
|
|
112
|
-
|
|
121
|
+
MODEL_PREFIX_TO_ENCODING = {
|
|
122
|
+
# chat
|
|
123
|
+
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
|
124
|
+
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
|
125
|
+
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
|
126
|
+
# fine-tuned
|
|
127
|
+
"ft:gpt-4": "cl100k_base",
|
|
128
|
+
"ft:gpt-3.5-turbo": "cl100k_base",
|
|
129
|
+
"ft:davinci-002": "cl100k_base",
|
|
130
|
+
"ft:babbage-002": "cl100k_base"
|
|
131
|
+
}
|
|
113
132
|
end
|
|
114
133
|
end
|
data/tiktoken_ruby.gemspec
CHANGED
|
@@ -7,12 +7,10 @@ Gem::Specification.new do |spec|
|
|
|
7
7
|
spec.version = Tiktoken::VERSION
|
|
8
8
|
spec.authors = ["IAPark"]
|
|
9
9
|
spec.email = ["isaac.a.park@gmail.com"]
|
|
10
|
-
|
|
11
10
|
spec.summary = "Ruby wrapper for Tiktoken"
|
|
12
11
|
spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
|
|
13
12
|
"a BPE tokenizer written by and used by OpenAI. It can be used to " \
|
|
14
13
|
"count the number of tokens in text before sending it to OpenAI APIs."
|
|
15
|
-
|
|
16
14
|
spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
|
|
17
15
|
spec.license = "MIT"
|
|
18
16
|
spec.required_ruby_version = ">= 2.7.0"
|
|
@@ -22,11 +20,6 @@ Gem::Specification.new do |spec|
|
|
|
22
20
|
spec.metadata["homepage_uri"] = spec.homepage
|
|
23
21
|
spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
|
|
24
22
|
spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
|
|
25
|
-
|
|
26
|
-
# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
|
27
|
-
|
|
28
|
-
# Specify which files should be added to the gem when it is released.
|
|
29
|
-
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
30
23
|
spec.files = Dir.chdir(__dir__) do
|
|
31
24
|
`git ls-files -z`.split("\x0").reject do |f|
|
|
32
25
|
(f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
|
|
@@ -36,9 +29,5 @@ Gem::Specification.new do |spec|
|
|
|
36
29
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
37
30
|
spec.require_paths = ["lib"]
|
|
38
31
|
spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
|
|
39
|
-
|
|
40
|
-
spec.add_dependency "rb_sys", "~> 0.9.68"
|
|
41
|
-
|
|
42
|
-
# For more information and examples about making a new gem, check out our
|
|
43
|
-
# guide at: https://bundler.io/guides/creating_gem.html
|
|
32
|
+
spec.add_dependency "rb_sys", ">= 0.9.86"
|
|
44
33
|
end
|
metadata
CHANGED
|
@@ -1,29 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tiktoken_ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- IAPark
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-04-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - "
|
|
17
|
+
- - ">="
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 0.9.
|
|
19
|
+
version: 0.9.86
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - "
|
|
24
|
+
- - ">="
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 0.9.
|
|
26
|
+
version: 0.9.86
|
|
27
27
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
|
28
28
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
|
29
29
|
it to OpenAI APIs.
|