tiktoken_ruby 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +33 -18
- data/Gemfile +5 -9
- data/Gemfile.lock +44 -33
- data/ext/tiktoken_ruby/Cargo.toml +2 -1
- data/ext/tiktoken_ruby/src/lib.rs +9 -5
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +36 -19
- data/tiktoken_ruby.gemspec +1 -12
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: de2d6e8e83771f2ef51351e019e9cebc7163a1775bfc0e812da58371574b9b63
|
|
4
|
+
data.tar.gz: fe2c629e8b435a181bfa4524655bfa137f1660061ab4f780e0edc15a11a7538d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bbf721963e873464fae055d23308068fcdac8db4e27dea28653c3fc017f0803da59b4c549a0fb6b6339f79bc4379e2913e1c47fa25f9894ebde840b23aa81edb
|
|
7
|
+
data.tar.gz: 1275369f56a2498ce39c5a2b259efdab0684ae834c6d769bfe23d7190c69917a22d46dc7b7675cd15c8a44dcfa8f7c3be2df850035236fa33f8e55afd31db42b
|
data/Cargo.lock
CHANGED
|
@@ -31,11 +31,11 @@ checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
|
|
|
31
31
|
|
|
32
32
|
[[package]]
|
|
33
33
|
name = "bindgen"
|
|
34
|
-
version = "0.
|
|
34
|
+
version = "0.66.1"
|
|
35
35
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
36
|
-
checksum = "
|
|
36
|
+
checksum = "f2b84e06fc203107bfbad243f4aba2af864eb7db3b1cf46ea0a023b0b433d2a7"
|
|
37
37
|
dependencies = [
|
|
38
|
-
"bitflags",
|
|
38
|
+
"bitflags 2.4.0",
|
|
39
39
|
"cexpr",
|
|
40
40
|
"clang-sys",
|
|
41
41
|
"lazy_static",
|
|
@@ -46,6 +46,7 @@ dependencies = [
|
|
|
46
46
|
"regex",
|
|
47
47
|
"rustc-hash",
|
|
48
48
|
"shlex",
|
|
49
|
+
"syn",
|
|
49
50
|
]
|
|
50
51
|
|
|
51
52
|
[[package]]
|
|
@@ -69,6 +70,12 @@ version = "1.3.2"
|
|
|
69
70
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
70
71
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
71
72
|
|
|
73
|
+
[[package]]
|
|
74
|
+
name = "bitflags"
|
|
75
|
+
version = "2.4.0"
|
|
76
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
77
|
+
checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
|
|
78
|
+
|
|
72
79
|
[[package]]
|
|
73
80
|
name = "bstr"
|
|
74
81
|
version = "1.4.0"
|
|
@@ -163,20 +170,21 @@ dependencies = [
|
|
|
163
170
|
|
|
164
171
|
[[package]]
|
|
165
172
|
name = "magnus"
|
|
166
|
-
version = "0.
|
|
173
|
+
version = "0.6.1"
|
|
167
174
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
168
|
-
checksum = "
|
|
175
|
+
checksum = "0516897a45f8ce8270a8910bcb94cd83538b19b6ae3a0c281a765df170b64695"
|
|
169
176
|
dependencies = [
|
|
170
177
|
"magnus-macros",
|
|
171
178
|
"rb-sys",
|
|
172
179
|
"rb-sys-env",
|
|
180
|
+
"seq-macro",
|
|
173
181
|
]
|
|
174
182
|
|
|
175
183
|
[[package]]
|
|
176
184
|
name = "magnus-macros"
|
|
177
|
-
version = "0.
|
|
185
|
+
version = "0.6.0"
|
|
178
186
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
179
|
-
checksum = "
|
|
187
|
+
checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
|
180
188
|
dependencies = [
|
|
181
189
|
"proc-macro2",
|
|
182
190
|
"quote",
|
|
@@ -242,36 +250,36 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
|
|
242
250
|
|
|
243
251
|
[[package]]
|
|
244
252
|
name = "proc-macro2"
|
|
245
|
-
version = "1.0.
|
|
253
|
+
version = "1.0.66"
|
|
246
254
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
247
|
-
checksum = "
|
|
255
|
+
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
|
248
256
|
dependencies = [
|
|
249
257
|
"unicode-ident",
|
|
250
258
|
]
|
|
251
259
|
|
|
252
260
|
[[package]]
|
|
253
261
|
name = "quote"
|
|
254
|
-
version = "1.0.
|
|
262
|
+
version = "1.0.33"
|
|
255
263
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
256
|
-
checksum = "
|
|
264
|
+
checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
|
|
257
265
|
dependencies = [
|
|
258
266
|
"proc-macro2",
|
|
259
267
|
]
|
|
260
268
|
|
|
261
269
|
[[package]]
|
|
262
270
|
name = "rb-sys"
|
|
263
|
-
version = "0.9.
|
|
271
|
+
version = "0.9.81"
|
|
264
272
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
265
|
-
checksum = "
|
|
273
|
+
checksum = "a57240b308b155b09dce81e32829966a99f52d1088b45957e4283e526c5317a1"
|
|
266
274
|
dependencies = [
|
|
267
275
|
"rb-sys-build",
|
|
268
276
|
]
|
|
269
277
|
|
|
270
278
|
[[package]]
|
|
271
279
|
name = "rb-sys-build"
|
|
272
|
-
version = "0.9.
|
|
280
|
+
version = "0.9.81"
|
|
273
281
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
274
|
-
checksum = "
|
|
282
|
+
checksum = "f24ce877a4c5d07f06f6aa6fec3ac95e4b357b9f73b0f5445d8cbb7266d410e8"
|
|
275
283
|
dependencies = [
|
|
276
284
|
"bindgen",
|
|
277
285
|
"lazy_static",
|
|
@@ -294,7 +302,7 @@ version = "0.2.16"
|
|
|
294
302
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
295
303
|
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
|
296
304
|
dependencies = [
|
|
297
|
-
"bitflags",
|
|
305
|
+
"bitflags 1.3.2",
|
|
298
306
|
]
|
|
299
307
|
|
|
300
308
|
[[package]]
|
|
@@ -332,6 +340,12 @@ version = "1.1.0"
|
|
|
332
340
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
333
341
|
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
|
334
342
|
|
|
343
|
+
[[package]]
|
|
344
|
+
name = "seq-macro"
|
|
345
|
+
version = "0.3.5"
|
|
346
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
347
|
+
checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
348
|
+
|
|
335
349
|
[[package]]
|
|
336
350
|
name = "serde"
|
|
337
351
|
version = "1.0.157"
|
|
@@ -358,9 +372,9 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
|
|
358
372
|
|
|
359
373
|
[[package]]
|
|
360
374
|
name = "syn"
|
|
361
|
-
version = "
|
|
375
|
+
version = "2.0.31"
|
|
362
376
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
363
|
-
checksum = "
|
|
377
|
+
checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398"
|
|
364
378
|
dependencies = [
|
|
365
379
|
"proc-macro2",
|
|
366
380
|
"quote",
|
|
@@ -386,6 +400,7 @@ name = "tiktoken_ruby"
|
|
|
386
400
|
version = "0.1.0"
|
|
387
401
|
dependencies = [
|
|
388
402
|
"magnus",
|
|
403
|
+
"rb-sys",
|
|
389
404
|
"tiktoken-rs",
|
|
390
405
|
]
|
|
391
406
|
|
data/Gemfile
CHANGED
|
@@ -2,15 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
source "https://rubygems.org"
|
|
4
4
|
|
|
5
|
-
# Specify your gem's dependencies in tiktoken_ruby.gemspec
|
|
6
5
|
gemspec
|
|
7
6
|
|
|
8
|
-
gem "rake"
|
|
9
|
-
|
|
7
|
+
gem "rake"
|
|
10
8
|
gem "rake-compiler"
|
|
11
|
-
|
|
12
|
-
gem "
|
|
13
|
-
|
|
14
|
-
gem "
|
|
15
|
-
|
|
16
|
-
gem "yard-doctest", "~> 0.1.17"
|
|
9
|
+
gem "rspec"
|
|
10
|
+
gem "standard"
|
|
11
|
+
gem "yard-doctest"
|
|
12
|
+
gem "racc"
|
data/Gemfile.lock
CHANGED
|
@@ -1,64 +1,74 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
tiktoken_ruby (0.0.
|
|
5
|
-
rb_sys (
|
|
4
|
+
tiktoken_ruby (0.0.7)
|
|
5
|
+
rb_sys (>= 0.9.86)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
10
|
ast (2.4.2)
|
|
11
11
|
diff-lcs (1.5.0)
|
|
12
|
-
json (2.
|
|
12
|
+
json (2.7.1)
|
|
13
13
|
language_server-protocol (3.17.0.3)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
14
|
+
lint_roller (1.1.0)
|
|
15
|
+
minitest (5.21.2)
|
|
16
|
+
parallel (1.24.0)
|
|
17
|
+
parser (3.3.0.4)
|
|
17
18
|
ast (~> 2.4.1)
|
|
19
|
+
racc
|
|
20
|
+
racc (1.7.3)
|
|
18
21
|
rainbow (3.1.1)
|
|
19
|
-
rake (13.0
|
|
20
|
-
rake-compiler (1.2.
|
|
22
|
+
rake (13.1.0)
|
|
23
|
+
rake-compiler (1.2.5)
|
|
21
24
|
rake
|
|
22
|
-
rb_sys (0.9.
|
|
23
|
-
regexp_parser (2.
|
|
24
|
-
rexml (3.2.
|
|
25
|
+
rb_sys (0.9.86)
|
|
26
|
+
regexp_parser (2.9.0)
|
|
27
|
+
rexml (3.2.6)
|
|
25
28
|
rspec (3.12.0)
|
|
26
29
|
rspec-core (~> 3.12.0)
|
|
27
30
|
rspec-expectations (~> 3.12.0)
|
|
28
31
|
rspec-mocks (~> 3.12.0)
|
|
29
|
-
rspec-core (3.12.
|
|
32
|
+
rspec-core (3.12.2)
|
|
30
33
|
rspec-support (~> 3.12.0)
|
|
31
|
-
rspec-expectations (3.12.
|
|
34
|
+
rspec-expectations (3.12.3)
|
|
32
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
33
36
|
rspec-support (~> 3.12.0)
|
|
34
|
-
rspec-mocks (3.12.
|
|
37
|
+
rspec-mocks (3.12.6)
|
|
35
38
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
36
39
|
rspec-support (~> 3.12.0)
|
|
37
|
-
rspec-support (3.12.
|
|
38
|
-
rubocop (1.
|
|
40
|
+
rspec-support (3.12.1)
|
|
41
|
+
rubocop (1.59.0)
|
|
39
42
|
json (~> 2.3)
|
|
43
|
+
language_server-protocol (>= 3.17.0)
|
|
40
44
|
parallel (~> 1.10)
|
|
41
|
-
parser (>= 3.2.
|
|
45
|
+
parser (>= 3.2.2.4)
|
|
42
46
|
rainbow (>= 2.2.2, < 4.0)
|
|
43
47
|
regexp_parser (>= 1.8, < 3.0)
|
|
44
48
|
rexml (>= 3.2.5, < 4.0)
|
|
45
|
-
rubocop-ast (>= 1.
|
|
49
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
|
46
50
|
ruby-progressbar (~> 1.7)
|
|
47
51
|
unicode-display_width (>= 2.4.0, < 3.0)
|
|
48
|
-
rubocop-ast (1.
|
|
52
|
+
rubocop-ast (1.30.0)
|
|
49
53
|
parser (>= 3.2.1.0)
|
|
50
|
-
rubocop-performance (1.
|
|
51
|
-
rubocop (>= 1.
|
|
52
|
-
rubocop-ast (>=
|
|
54
|
+
rubocop-performance (1.20.2)
|
|
55
|
+
rubocop (>= 1.48.1, < 2.0)
|
|
56
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
|
53
57
|
ruby-progressbar (1.13.0)
|
|
54
|
-
standard (1.
|
|
58
|
+
standard (1.33.0)
|
|
55
59
|
language_server-protocol (~> 3.17.0.2)
|
|
56
|
-
|
|
57
|
-
rubocop
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
lint_roller (~> 1.0)
|
|
61
|
+
rubocop (~> 1.59.0)
|
|
62
|
+
standard-custom (~> 1.0.0)
|
|
63
|
+
standard-performance (~> 1.3)
|
|
64
|
+
standard-custom (1.0.2)
|
|
65
|
+
lint_roller (~> 1.0)
|
|
66
|
+
rubocop (~> 1.50)
|
|
67
|
+
standard-performance (1.3.1)
|
|
68
|
+
lint_roller (~> 1.1)
|
|
69
|
+
rubocop-performance (~> 1.20.2)
|
|
70
|
+
unicode-display_width (2.5.0)
|
|
71
|
+
yard (0.9.34)
|
|
62
72
|
yard-doctest (0.1.17)
|
|
63
73
|
minitest
|
|
64
74
|
yard
|
|
@@ -70,12 +80,13 @@ PLATFORMS
|
|
|
70
80
|
x86_64-linux
|
|
71
81
|
|
|
72
82
|
DEPENDENCIES
|
|
73
|
-
|
|
83
|
+
racc
|
|
84
|
+
rake
|
|
74
85
|
rake-compiler
|
|
75
|
-
rspec
|
|
76
|
-
standard
|
|
86
|
+
rspec
|
|
87
|
+
standard
|
|
77
88
|
tiktoken_ruby!
|
|
78
|
-
yard-doctest
|
|
89
|
+
yard-doctest
|
|
79
90
|
|
|
80
91
|
BUNDLED WITH
|
|
81
92
|
2.4.6
|
|
@@ -10,5 +10,6 @@ publish = false
|
|
|
10
10
|
crate-type = ["cdylib"]
|
|
11
11
|
|
|
12
12
|
[dependencies]
|
|
13
|
-
magnus = { version = "0.
|
|
13
|
+
magnus = { version = "0.6.1" }
|
|
14
|
+
rb-sys = { version = "*", features = ["stable-api-compiled-fallback"] }
|
|
14
15
|
tiktoken-rs = { git = "https://github.com/IAPark/tiktoken-rs.git" }
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
mod core_bpe_wrapper;
|
|
2
2
|
|
|
3
3
|
use core_bpe_wrapper::CoreBPEWrapper;
|
|
4
|
-
use magnus::{define_module, function, prelude::*, Error,
|
|
4
|
+
use magnus::{class, define_module, function, method, prelude::*, Error, ExceptionClass, RModule};
|
|
5
5
|
|
|
6
6
|
fn r50k_base() -> CoreBPEWrapper {
|
|
7
7
|
let core_bpe = tiktoken_rs::r50k_base().unwrap();
|
|
@@ -38,14 +38,18 @@ fn init() -> Result<(), Error> {
|
|
|
38
38
|
factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
|
|
39
39
|
factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
|
|
40
40
|
|
|
41
|
-
|
|
42
41
|
let ext_module = module.define_module("Ext")?;
|
|
43
42
|
let bpe_class = ext_module.define_class("CoreBPE", class::object())?;
|
|
44
43
|
|
|
45
|
-
bpe_class.define_method(
|
|
44
|
+
bpe_class.define_method(
|
|
45
|
+
"encode_ordinary",
|
|
46
|
+
method!(CoreBPEWrapper::encode_ordinary, 1),
|
|
47
|
+
)?;
|
|
46
48
|
bpe_class.define_method("encode", method!(CoreBPEWrapper::encode, 2))?;
|
|
47
|
-
bpe_class.define_method(
|
|
48
|
-
|
|
49
|
+
bpe_class.define_method(
|
|
50
|
+
"encode_with_special_tokens",
|
|
51
|
+
method!(CoreBPEWrapper::encode_with_special_tokens, 1),
|
|
52
|
+
)?;
|
|
49
53
|
|
|
50
54
|
bpe_class.define_method("decode", method!(CoreBPEWrapper::decode, 1))?;
|
|
51
55
|
Ok(())
|
data/lib/tiktoken_ruby.rb
CHANGED
|
@@ -33,17 +33,15 @@ module Tiktoken
|
|
|
33
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
|
34
34
|
# enc.encode("hello world").length #=> 2
|
|
35
35
|
def encoding_for_model(model_name)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
model_name = prefix
|
|
39
|
-
break
|
|
40
|
-
end
|
|
36
|
+
if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
|
|
37
|
+
return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
|
|
41
38
|
end
|
|
42
39
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
40
|
+
MODEL_PREFIX_TO_ENCODING.each do |prefix, encoding|
|
|
41
|
+
if model_name.start_with?(prefix.to_s)
|
|
42
|
+
return get_encoding(encoding)
|
|
43
|
+
end
|
|
44
|
+
end
|
|
47
45
|
end
|
|
48
46
|
|
|
49
47
|
# Lists all the encodings that are supported
|
|
@@ -67,12 +65,22 @@ module Tiktoken
|
|
|
67
65
|
:cl100k_base
|
|
68
66
|
]
|
|
69
67
|
|
|
70
|
-
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
|
68
|
+
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
|
71
69
|
# that is also MIT licensed but by OpenAI
|
|
72
70
|
MODEL_TO_ENCODING_NAME = {
|
|
71
|
+
# chat
|
|
73
72
|
"gpt-4": "cl100k_base",
|
|
74
73
|
"gpt-3.5-turbo": "cl100k_base",
|
|
75
|
-
#
|
|
74
|
+
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
|
75
|
+
# base
|
|
76
|
+
"davinci-002": "cl100k_base",
|
|
77
|
+
"babbage-002": "cl100k_base",
|
|
78
|
+
# embeddings
|
|
79
|
+
"text-embedding-ada-002": "cl100k_base",
|
|
80
|
+
"text-embedding-3-small": "cl100k_base",
|
|
81
|
+
"text-embedding-3-large": "cl100k_base",
|
|
82
|
+
# DEPRECATED MODELS
|
|
83
|
+
# text (DEPRECATED)
|
|
76
84
|
"text-davinci-003": "p50k_base",
|
|
77
85
|
"text-davinci-002": "p50k_base",
|
|
78
86
|
"text-davinci-001": "r50k_base",
|
|
@@ -83,19 +91,17 @@ module Tiktoken
|
|
|
83
91
|
curie: "r50k_base",
|
|
84
92
|
babbage: "r50k_base",
|
|
85
93
|
ada: "r50k_base",
|
|
86
|
-
# code
|
|
94
|
+
# code (DEPRECATED)
|
|
87
95
|
"code-davinci-002": "p50k_base",
|
|
88
96
|
"code-davinci-001": "p50k_base",
|
|
89
97
|
"code-cushman-002": "p50k_base",
|
|
90
98
|
"code-cushman-001": "p50k_base",
|
|
91
99
|
"davinci-codex": "p50k_base",
|
|
92
100
|
"cushman-codex": "p50k_base",
|
|
93
|
-
# edit
|
|
101
|
+
# edit (DEPRECATED)
|
|
94
102
|
"text-davinci-edit-001": "p50k_edit",
|
|
95
103
|
"code-davinci-edit-001": "p50k_edit",
|
|
96
|
-
# embeddings
|
|
97
|
-
"text-embedding-ada-002": "cl100k_base",
|
|
98
|
-
# old embeddings
|
|
104
|
+
# old embeddings (DEPRECATED)
|
|
99
105
|
"text-similarity-davinci-001": "r50k_base",
|
|
100
106
|
"text-similarity-curie-001": "r50k_base",
|
|
101
107
|
"text-similarity-babbage-001": "r50k_base",
|
|
@@ -105,10 +111,21 @@ module Tiktoken
|
|
|
105
111
|
"text-search-babbage-doc-001": "r50k_base",
|
|
106
112
|
"text-search-ada-doc-001": "r50k_base",
|
|
107
113
|
"code-search-babbage-code-001": "r50k_base",
|
|
108
|
-
"code-search-ada-code-001": "r50k_base"
|
|
114
|
+
"code-search-ada-code-001": "r50k_base",
|
|
115
|
+
# open source
|
|
116
|
+
gpt2: "gpt2"
|
|
109
117
|
}
|
|
110
118
|
|
|
111
|
-
|
|
112
|
-
|
|
119
|
+
MODEL_PREFIX_TO_ENCODING = {
|
|
120
|
+
# chat
|
|
121
|
+
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
|
122
|
+
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
|
123
|
+
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
|
124
|
+
# fine-tuned
|
|
125
|
+
"ft:gpt-4": "cl100k_base",
|
|
126
|
+
"ft:gpt-3.5-turbo": "cl100k_base",
|
|
127
|
+
"ft:davinci-002": "cl100k_base",
|
|
128
|
+
"ft:babbage-002": "cl100k_base"
|
|
129
|
+
}
|
|
113
130
|
end
|
|
114
131
|
end
|
data/tiktoken_ruby.gemspec
CHANGED
|
@@ -7,12 +7,10 @@ Gem::Specification.new do |spec|
|
|
|
7
7
|
spec.version = Tiktoken::VERSION
|
|
8
8
|
spec.authors = ["IAPark"]
|
|
9
9
|
spec.email = ["isaac.a.park@gmail.com"]
|
|
10
|
-
|
|
11
10
|
spec.summary = "Ruby wrapper for Tiktoken"
|
|
12
11
|
spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
|
|
13
12
|
"a BPE tokenizer written by and used by OpenAI. It can be used to " \
|
|
14
13
|
"count the number of tokens in text before sending it to OpenAI APIs."
|
|
15
|
-
|
|
16
14
|
spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
|
|
17
15
|
spec.license = "MIT"
|
|
18
16
|
spec.required_ruby_version = ">= 2.7.0"
|
|
@@ -22,11 +20,6 @@ Gem::Specification.new do |spec|
|
|
|
22
20
|
spec.metadata["homepage_uri"] = spec.homepage
|
|
23
21
|
spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
|
|
24
22
|
spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
|
|
25
|
-
|
|
26
|
-
# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
|
27
|
-
|
|
28
|
-
# Specify which files should be added to the gem when it is released.
|
|
29
|
-
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
30
23
|
spec.files = Dir.chdir(__dir__) do
|
|
31
24
|
`git ls-files -z`.split("\x0").reject do |f|
|
|
32
25
|
(f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
|
|
@@ -36,9 +29,5 @@ Gem::Specification.new do |spec|
|
|
|
36
29
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
37
30
|
spec.require_paths = ["lib"]
|
|
38
31
|
spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
|
|
39
|
-
|
|
40
|
-
spec.add_dependency "rb_sys", "~> 0.9.68"
|
|
41
|
-
|
|
42
|
-
# For more information and examples about making a new gem, check out our
|
|
43
|
-
# guide at: https://bundler.io/guides/creating_gem.html
|
|
32
|
+
spec.add_dependency "rb_sys", ">= 0.9.86"
|
|
44
33
|
end
|
metadata
CHANGED
|
@@ -1,29 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tiktoken_ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- IAPark
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-02-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - "
|
|
17
|
+
- - ">="
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 0.9.
|
|
19
|
+
version: 0.9.86
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - "
|
|
24
|
+
- - ">="
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 0.9.
|
|
26
|
+
version: 0.9.86
|
|
27
27
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
|
28
28
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
|
29
29
|
it to OpenAI APIs.
|