tiktoken_ruby 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +126 -133
- data/Gemfile.lock +4 -4
- data/README.md +10 -3
- data/ext/tiktoken_ruby/Cargo.toml +2 -2
- data/ext/tiktoken_ruby/src/lib.rs +6 -0
- data/lib/tiktoken_ruby/encoding.rb +6 -2
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +11 -6
- data/tiktoken_ruby.gemspec +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: abd5db9516cf5d26ace1790e1267038910af214b3315d157e35df851530b83cb
|
|
4
|
+
data.tar.gz: 88541080f80f27a52c8a7eb9bd9a2baf4dc3c67af4ba152bc03150ade1c89f72
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bdba999ff6ee22d57a993e7cbff9f1f95c22540973e7af26a58ded2e49e77c9863daedd936255a86c9722598b8fcc442d746c502d9b295f2758e52774aa8fd7c
|
|
7
|
+
data.tar.gz: 8917f5a08dbed662c890d0102c9d4be3d24a34d5537804b7ceb563819dd01e36d5b580d5e9f68a20464057620e8f2497558e0e3f1e0faf2afc044d2a4feec7e2
|
data/Cargo.lock
CHANGED
|
@@ -4,43 +4,43 @@ version = 3
|
|
|
4
4
|
|
|
5
5
|
[[package]]
|
|
6
6
|
name = "aho-corasick"
|
|
7
|
-
version = "
|
|
7
|
+
version = "1.1.3"
|
|
8
8
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
9
|
-
checksum = "
|
|
9
|
+
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
|
10
10
|
dependencies = [
|
|
11
11
|
"memchr",
|
|
12
12
|
]
|
|
13
13
|
|
|
14
14
|
[[package]]
|
|
15
15
|
name = "anyhow"
|
|
16
|
-
version = "1.0.
|
|
16
|
+
version = "1.0.83"
|
|
17
17
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
18
|
-
checksum = "
|
|
18
|
+
checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3"
|
|
19
19
|
|
|
20
20
|
[[package]]
|
|
21
21
|
name = "autocfg"
|
|
22
|
-
version = "1.
|
|
22
|
+
version = "1.3.0"
|
|
23
23
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
24
|
-
checksum = "
|
|
24
|
+
checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
|
|
25
25
|
|
|
26
26
|
[[package]]
|
|
27
27
|
name = "base64"
|
|
28
|
-
version = "0.21.
|
|
28
|
+
version = "0.21.7"
|
|
29
29
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
30
|
-
checksum = "
|
|
30
|
+
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
|
|
31
31
|
|
|
32
32
|
[[package]]
|
|
33
33
|
name = "bindgen"
|
|
34
|
-
version = "0.
|
|
34
|
+
version = "0.69.4"
|
|
35
35
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
36
|
-
checksum = "
|
|
36
|
+
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
|
|
37
37
|
dependencies = [
|
|
38
|
-
"bitflags
|
|
38
|
+
"bitflags",
|
|
39
39
|
"cexpr",
|
|
40
40
|
"clang-sys",
|
|
41
|
+
"itertools",
|
|
41
42
|
"lazy_static",
|
|
42
43
|
"lazycell",
|
|
43
|
-
"peeking_take_while",
|
|
44
44
|
"proc-macro2",
|
|
45
45
|
"quote",
|
|
46
46
|
"regex",
|
|
@@ -66,24 +66,17 @@ checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
|
|
|
66
66
|
|
|
67
67
|
[[package]]
|
|
68
68
|
name = "bitflags"
|
|
69
|
-
version = "
|
|
70
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
71
|
-
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
72
|
-
|
|
73
|
-
[[package]]
|
|
74
|
-
name = "bitflags"
|
|
75
|
-
version = "2.4.0"
|
|
69
|
+
version = "2.5.0"
|
|
76
70
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
77
|
-
checksum = "
|
|
71
|
+
checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
|
|
78
72
|
|
|
79
73
|
[[package]]
|
|
80
74
|
name = "bstr"
|
|
81
|
-
version = "1.
|
|
75
|
+
version = "1.9.1"
|
|
82
76
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
83
|
-
checksum = "
|
|
77
|
+
checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706"
|
|
84
78
|
dependencies = [
|
|
85
79
|
"memchr",
|
|
86
|
-
"once_cell",
|
|
87
80
|
"regex-automata",
|
|
88
81
|
"serde",
|
|
89
82
|
]
|
|
@@ -105,20 +98,26 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
|
105
98
|
|
|
106
99
|
[[package]]
|
|
107
100
|
name = "clang-sys"
|
|
108
|
-
version = "1.
|
|
101
|
+
version = "1.7.0"
|
|
109
102
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
110
|
-
checksum = "
|
|
103
|
+
checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1"
|
|
111
104
|
dependencies = [
|
|
112
105
|
"glob",
|
|
113
106
|
"libc",
|
|
114
107
|
"libloading",
|
|
115
108
|
]
|
|
116
109
|
|
|
110
|
+
[[package]]
|
|
111
|
+
name = "either"
|
|
112
|
+
version = "1.11.0"
|
|
113
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
114
|
+
checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
|
|
115
|
+
|
|
117
116
|
[[package]]
|
|
118
117
|
name = "fancy-regex"
|
|
119
|
-
version = "0.
|
|
118
|
+
version = "0.12.0"
|
|
120
119
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
121
|
-
checksum = "
|
|
120
|
+
checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05"
|
|
122
121
|
dependencies = [
|
|
123
122
|
"bit-set",
|
|
124
123
|
"regex",
|
|
@@ -130,6 +129,15 @@ version = "0.3.1"
|
|
|
130
129
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
131
130
|
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
|
132
131
|
|
|
132
|
+
[[package]]
|
|
133
|
+
name = "itertools"
|
|
134
|
+
version = "0.12.1"
|
|
135
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
136
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
|
137
|
+
dependencies = [
|
|
138
|
+
"either",
|
|
139
|
+
]
|
|
140
|
+
|
|
133
141
|
[[package]]
|
|
134
142
|
name = "lazy_static"
|
|
135
143
|
version = "1.4.0"
|
|
@@ -144,25 +152,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
|
144
152
|
|
|
145
153
|
[[package]]
|
|
146
154
|
name = "libc"
|
|
147
|
-
version = "0.2.
|
|
155
|
+
version = "0.2.154"
|
|
148
156
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
149
|
-
checksum = "
|
|
157
|
+
checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346"
|
|
150
158
|
|
|
151
159
|
[[package]]
|
|
152
160
|
name = "libloading"
|
|
153
|
-
version = "0.
|
|
161
|
+
version = "0.8.3"
|
|
154
162
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
155
|
-
checksum = "
|
|
163
|
+
checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19"
|
|
156
164
|
dependencies = [
|
|
157
165
|
"cfg-if",
|
|
158
|
-
"
|
|
166
|
+
"windows-targets",
|
|
159
167
|
]
|
|
160
168
|
|
|
161
169
|
[[package]]
|
|
162
170
|
name = "lock_api"
|
|
163
|
-
version = "0.4.
|
|
171
|
+
version = "0.4.12"
|
|
164
172
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
165
|
-
checksum = "
|
|
173
|
+
checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
|
|
166
174
|
dependencies = [
|
|
167
175
|
"autocfg",
|
|
168
176
|
"scopeguard",
|
|
@@ -170,9 +178,9 @@ dependencies = [
|
|
|
170
178
|
|
|
171
179
|
[[package]]
|
|
172
180
|
name = "magnus"
|
|
173
|
-
version = "0.6.
|
|
181
|
+
version = "0.6.4"
|
|
174
182
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
175
|
-
checksum = "
|
|
183
|
+
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
|
176
184
|
dependencies = [
|
|
177
185
|
"magnus-macros",
|
|
178
186
|
"rb-sys",
|
|
@@ -193,9 +201,9 @@ dependencies = [
|
|
|
193
201
|
|
|
194
202
|
[[package]]
|
|
195
203
|
name = "memchr"
|
|
196
|
-
version = "2.
|
|
204
|
+
version = "2.7.2"
|
|
197
205
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
198
|
-
checksum = "
|
|
206
|
+
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
|
|
199
207
|
|
|
200
208
|
[[package]]
|
|
201
209
|
name = "minimal-lexical"
|
|
@@ -213,17 +221,11 @@ dependencies = [
|
|
|
213
221
|
"minimal-lexical",
|
|
214
222
|
]
|
|
215
223
|
|
|
216
|
-
[[package]]
|
|
217
|
-
name = "once_cell"
|
|
218
|
-
version = "1.17.1"
|
|
219
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
220
|
-
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
|
221
|
-
|
|
222
224
|
[[package]]
|
|
223
225
|
name = "parking_lot"
|
|
224
|
-
version = "0.12.
|
|
226
|
+
version = "0.12.2"
|
|
225
227
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
226
|
-
checksum = "
|
|
228
|
+
checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb"
|
|
227
229
|
dependencies = [
|
|
228
230
|
"lock_api",
|
|
229
231
|
"parking_lot_core",
|
|
@@ -231,55 +233,49 @@ dependencies = [
|
|
|
231
233
|
|
|
232
234
|
[[package]]
|
|
233
235
|
name = "parking_lot_core"
|
|
234
|
-
version = "0.9.
|
|
236
|
+
version = "0.9.10"
|
|
235
237
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
236
|
-
checksum = "
|
|
238
|
+
checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
|
|
237
239
|
dependencies = [
|
|
238
240
|
"cfg-if",
|
|
239
241
|
"libc",
|
|
240
242
|
"redox_syscall",
|
|
241
243
|
"smallvec",
|
|
242
|
-
"windows-
|
|
244
|
+
"windows-targets",
|
|
243
245
|
]
|
|
244
246
|
|
|
245
|
-
[[package]]
|
|
246
|
-
name = "peeking_take_while"
|
|
247
|
-
version = "0.1.2"
|
|
248
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
249
|
-
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
|
250
|
-
|
|
251
247
|
[[package]]
|
|
252
248
|
name = "proc-macro2"
|
|
253
|
-
version = "1.0.
|
|
249
|
+
version = "1.0.82"
|
|
254
250
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
255
|
-
checksum = "
|
|
251
|
+
checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b"
|
|
256
252
|
dependencies = [
|
|
257
253
|
"unicode-ident",
|
|
258
254
|
]
|
|
259
255
|
|
|
260
256
|
[[package]]
|
|
261
257
|
name = "quote"
|
|
262
|
-
version = "1.0.
|
|
258
|
+
version = "1.0.36"
|
|
263
259
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
264
|
-
checksum = "
|
|
260
|
+
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
|
|
265
261
|
dependencies = [
|
|
266
262
|
"proc-macro2",
|
|
267
263
|
]
|
|
268
264
|
|
|
269
265
|
[[package]]
|
|
270
266
|
name = "rb-sys"
|
|
271
|
-
version = "0.9.
|
|
267
|
+
version = "0.9.97"
|
|
272
268
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
273
|
-
checksum = "
|
|
269
|
+
checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
|
|
274
270
|
dependencies = [
|
|
275
271
|
"rb-sys-build",
|
|
276
272
|
]
|
|
277
273
|
|
|
278
274
|
[[package]]
|
|
279
275
|
name = "rb-sys-build"
|
|
280
|
-
version = "0.9.
|
|
276
|
+
version = "0.9.97"
|
|
281
277
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
282
|
-
checksum = "
|
|
278
|
+
checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
|
|
283
279
|
dependencies = [
|
|
284
280
|
"bindgen",
|
|
285
281
|
"lazy_static",
|
|
@@ -298,35 +294,41 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
|
298
294
|
|
|
299
295
|
[[package]]
|
|
300
296
|
name = "redox_syscall"
|
|
301
|
-
version = "0.
|
|
297
|
+
version = "0.5.1"
|
|
302
298
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
303
|
-
checksum = "
|
|
299
|
+
checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e"
|
|
304
300
|
dependencies = [
|
|
305
|
-
"bitflags
|
|
301
|
+
"bitflags",
|
|
306
302
|
]
|
|
307
303
|
|
|
308
304
|
[[package]]
|
|
309
305
|
name = "regex"
|
|
310
|
-
version = "1.
|
|
306
|
+
version = "1.10.4"
|
|
311
307
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
312
|
-
checksum = "
|
|
308
|
+
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
|
|
313
309
|
dependencies = [
|
|
314
310
|
"aho-corasick",
|
|
315
311
|
"memchr",
|
|
312
|
+
"regex-automata",
|
|
316
313
|
"regex-syntax",
|
|
317
314
|
]
|
|
318
315
|
|
|
319
316
|
[[package]]
|
|
320
317
|
name = "regex-automata"
|
|
321
|
-
version = "0.
|
|
318
|
+
version = "0.4.6"
|
|
322
319
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
323
|
-
checksum = "
|
|
320
|
+
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
|
|
321
|
+
dependencies = [
|
|
322
|
+
"aho-corasick",
|
|
323
|
+
"memchr",
|
|
324
|
+
"regex-syntax",
|
|
325
|
+
]
|
|
324
326
|
|
|
325
327
|
[[package]]
|
|
326
328
|
name = "regex-syntax"
|
|
327
|
-
version = "0.
|
|
329
|
+
version = "0.8.3"
|
|
328
330
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
329
|
-
checksum = "
|
|
331
|
+
checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56"
|
|
330
332
|
|
|
331
333
|
[[package]]
|
|
332
334
|
name = "rustc-hash"
|
|
@@ -336,9 +338,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
|
336
338
|
|
|
337
339
|
[[package]]
|
|
338
340
|
name = "scopeguard"
|
|
339
|
-
version = "1.
|
|
341
|
+
version = "1.2.0"
|
|
340
342
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
341
|
-
checksum = "
|
|
343
|
+
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
|
342
344
|
|
|
343
345
|
[[package]]
|
|
344
346
|
name = "seq-macro"
|
|
@@ -348,9 +350,23 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
|
348
350
|
|
|
349
351
|
[[package]]
|
|
350
352
|
name = "serde"
|
|
351
|
-
version = "1.0.
|
|
353
|
+
version = "1.0.202"
|
|
352
354
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
353
|
-
checksum = "
|
|
355
|
+
checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395"
|
|
356
|
+
dependencies = [
|
|
357
|
+
"serde_derive",
|
|
358
|
+
]
|
|
359
|
+
|
|
360
|
+
[[package]]
|
|
361
|
+
name = "serde_derive"
|
|
362
|
+
version = "1.0.202"
|
|
363
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
364
|
+
checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838"
|
|
365
|
+
dependencies = [
|
|
366
|
+
"proc-macro2",
|
|
367
|
+
"quote",
|
|
368
|
+
"syn",
|
|
369
|
+
]
|
|
354
370
|
|
|
355
371
|
[[package]]
|
|
356
372
|
name = "shell-words"
|
|
@@ -360,21 +376,21 @@ checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
|
|
|
360
376
|
|
|
361
377
|
[[package]]
|
|
362
378
|
name = "shlex"
|
|
363
|
-
version = "1.
|
|
379
|
+
version = "1.3.0"
|
|
364
380
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
365
|
-
checksum = "
|
|
381
|
+
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
|
366
382
|
|
|
367
383
|
[[package]]
|
|
368
384
|
name = "smallvec"
|
|
369
|
-
version = "1.
|
|
385
|
+
version = "1.13.2"
|
|
370
386
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
371
|
-
checksum = "
|
|
387
|
+
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
|
372
388
|
|
|
373
389
|
[[package]]
|
|
374
390
|
name = "syn"
|
|
375
|
-
version = "2.0.
|
|
391
|
+
version = "2.0.63"
|
|
376
392
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
377
|
-
checksum = "
|
|
393
|
+
checksum = "bf5be731623ca1a1fb7d8be6f261a3be6d3e2337b8a1f97be944d020c8fcb704"
|
|
378
394
|
dependencies = [
|
|
379
395
|
"proc-macro2",
|
|
380
396
|
"quote",
|
|
@@ -383,8 +399,9 @@ dependencies = [
|
|
|
383
399
|
|
|
384
400
|
[[package]]
|
|
385
401
|
name = "tiktoken-rs"
|
|
386
|
-
version = "0.
|
|
387
|
-
source = "
|
|
402
|
+
version = "0.5.9"
|
|
403
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
404
|
+
checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234"
|
|
388
405
|
dependencies = [
|
|
389
406
|
"anyhow",
|
|
390
407
|
"base64",
|
|
@@ -406,50 +423,20 @@ dependencies = [
|
|
|
406
423
|
|
|
407
424
|
[[package]]
|
|
408
425
|
name = "unicode-ident"
|
|
409
|
-
version = "1.0.
|
|
410
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
411
|
-
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
|
|
412
|
-
|
|
413
|
-
[[package]]
|
|
414
|
-
name = "winapi"
|
|
415
|
-
version = "0.3.9"
|
|
426
|
+
version = "1.0.12"
|
|
416
427
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
417
|
-
checksum = "
|
|
418
|
-
dependencies = [
|
|
419
|
-
"winapi-i686-pc-windows-gnu",
|
|
420
|
-
"winapi-x86_64-pc-windows-gnu",
|
|
421
|
-
]
|
|
422
|
-
|
|
423
|
-
[[package]]
|
|
424
|
-
name = "winapi-i686-pc-windows-gnu"
|
|
425
|
-
version = "0.4.0"
|
|
426
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
427
|
-
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
|
428
|
-
|
|
429
|
-
[[package]]
|
|
430
|
-
name = "winapi-x86_64-pc-windows-gnu"
|
|
431
|
-
version = "0.4.0"
|
|
432
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
433
|
-
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
|
434
|
-
|
|
435
|
-
[[package]]
|
|
436
|
-
name = "windows-sys"
|
|
437
|
-
version = "0.45.0"
|
|
438
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
439
|
-
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
|
|
440
|
-
dependencies = [
|
|
441
|
-
"windows-targets",
|
|
442
|
-
]
|
|
428
|
+
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
|
443
429
|
|
|
444
430
|
[[package]]
|
|
445
431
|
name = "windows-targets"
|
|
446
|
-
version = "0.
|
|
432
|
+
version = "0.52.5"
|
|
447
433
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
448
|
-
checksum = "
|
|
434
|
+
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
|
|
449
435
|
dependencies = [
|
|
450
436
|
"windows_aarch64_gnullvm",
|
|
451
437
|
"windows_aarch64_msvc",
|
|
452
438
|
"windows_i686_gnu",
|
|
439
|
+
"windows_i686_gnullvm",
|
|
453
440
|
"windows_i686_msvc",
|
|
454
441
|
"windows_x86_64_gnu",
|
|
455
442
|
"windows_x86_64_gnullvm",
|
|
@@ -458,42 +445,48 @@ dependencies = [
|
|
|
458
445
|
|
|
459
446
|
[[package]]
|
|
460
447
|
name = "windows_aarch64_gnullvm"
|
|
461
|
-
version = "0.
|
|
448
|
+
version = "0.52.5"
|
|
462
449
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
463
|
-
checksum = "
|
|
450
|
+
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
|
|
464
451
|
|
|
465
452
|
[[package]]
|
|
466
453
|
name = "windows_aarch64_msvc"
|
|
467
|
-
version = "0.
|
|
454
|
+
version = "0.52.5"
|
|
468
455
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
469
|
-
checksum = "
|
|
456
|
+
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
|
|
470
457
|
|
|
471
458
|
[[package]]
|
|
472
459
|
name = "windows_i686_gnu"
|
|
473
|
-
version = "0.
|
|
460
|
+
version = "0.52.5"
|
|
461
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
462
|
+
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
|
|
463
|
+
|
|
464
|
+
[[package]]
|
|
465
|
+
name = "windows_i686_gnullvm"
|
|
466
|
+
version = "0.52.5"
|
|
474
467
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
475
|
-
checksum = "
|
|
468
|
+
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
|
|
476
469
|
|
|
477
470
|
[[package]]
|
|
478
471
|
name = "windows_i686_msvc"
|
|
479
|
-
version = "0.
|
|
472
|
+
version = "0.52.5"
|
|
480
473
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
481
|
-
checksum = "
|
|
474
|
+
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
|
|
482
475
|
|
|
483
476
|
[[package]]
|
|
484
477
|
name = "windows_x86_64_gnu"
|
|
485
|
-
version = "0.
|
|
478
|
+
version = "0.52.5"
|
|
486
479
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
487
|
-
checksum = "
|
|
480
|
+
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
|
|
488
481
|
|
|
489
482
|
[[package]]
|
|
490
483
|
name = "windows_x86_64_gnullvm"
|
|
491
|
-
version = "0.
|
|
484
|
+
version = "0.52.5"
|
|
492
485
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
493
|
-
checksum = "
|
|
486
|
+
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
|
|
494
487
|
|
|
495
488
|
[[package]]
|
|
496
489
|
name = "windows_x86_64_msvc"
|
|
497
|
-
version = "0.
|
|
490
|
+
version = "0.52.5"
|
|
498
491
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
499
|
-
checksum = "
|
|
492
|
+
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
|
data/Gemfile.lock
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
tiktoken_ruby (0.0.
|
|
5
|
-
rb_sys (
|
|
4
|
+
tiktoken_ruby (0.0.9)
|
|
5
|
+
rb_sys (= 0.9.87)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
@@ -22,7 +22,7 @@ GEM
|
|
|
22
22
|
rake (13.1.0)
|
|
23
23
|
rake-compiler (1.2.5)
|
|
24
24
|
rake
|
|
25
|
-
rb_sys (0.9.
|
|
25
|
+
rb_sys (0.9.87)
|
|
26
26
|
regexp_parser (2.9.0)
|
|
27
27
|
rexml (3.2.6)
|
|
28
28
|
rspec (3.12.0)
|
|
@@ -89,4 +89,4 @@ DEPENDENCIES
|
|
|
89
89
|
yard-doctest
|
|
90
90
|
|
|
91
91
|
BUNDLED WITH
|
|
92
|
-
2.4.
|
|
92
|
+
2.4.4
|
data/README.md
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
[](https://badge.fury.io/rb/tiktoken_ruby)
|
|
2
|
+
|
|
2
3
|
# tiktoken_ruby
|
|
3
4
|
|
|
4
5
|
[Tiktoken](https://github.com/openai/tiktoken) is BPE tokenizer from OpenAI used with their GPT models.
|
|
5
|
-
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
|
6
|
+
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
|
7
|
+
|
|
8
|
+
## Request for maintainers
|
|
9
|
+
|
|
10
|
+
I can't really put substantial time into maintaining this. Probably nothing more than a couple hours every few months. If you have experience maintaining ruby gems and would like to
|
|
11
|
+
lend a hand please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26)
|
|
6
12
|
|
|
7
13
|
## Installation
|
|
8
14
|
|
|
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
|
15
21
|
$ gem install tiktoken_ruby
|
|
16
22
|
|
|
17
23
|
## Usage
|
|
24
|
+
|
|
18
25
|
Usage should be very similar to the python library. Here's a simple example
|
|
19
26
|
|
|
20
27
|
Encode and decode text
|
|
28
|
+
|
|
21
29
|
```ruby
|
|
22
30
|
require 'tiktoken_ruby'
|
|
23
|
-
|
|
24
31
|
enc = Tiktoken.get_encoding("cl100k_base")
|
|
25
32
|
enc.decode(enc.encode("hello world")) #=> "hello world"
|
|
26
33
|
```
|
|
27
34
|
|
|
28
35
|
Encoders can also be retrieved by model name
|
|
36
|
+
|
|
29
37
|
```ruby
|
|
30
38
|
require 'tiktoken_ruby'
|
|
31
39
|
|
|
@@ -53,7 +61,6 @@ bundle exec rake compile
|
|
|
53
61
|
bundle exec rake spec
|
|
54
62
|
```
|
|
55
63
|
|
|
56
|
-
|
|
57
64
|
## License
|
|
58
65
|
|
|
59
66
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
@@ -11,5 +11,5 @@ crate-type = ["cdylib"]
|
|
|
11
11
|
|
|
12
12
|
[dependencies]
|
|
13
13
|
magnus = { version = "0.6.1" }
|
|
14
|
-
rb-sys = { version = "
|
|
15
|
-
tiktoken-rs = {
|
|
14
|
+
rb-sys = { version = "0.9.87", features = ["stable-api-compiled-fallback"] }
|
|
15
|
+
tiktoken-rs = { version = "0.5.9" }
|
|
@@ -20,6 +20,11 @@ fn cl100k_base() -> CoreBPEWrapper {
|
|
|
20
20
|
CoreBPEWrapper::new(core_bpe)
|
|
21
21
|
}
|
|
22
22
|
|
|
23
|
+
fn o200k_base() -> CoreBPEWrapper {
|
|
24
|
+
let core_bpe = tiktoken_rs::o200k_base().unwrap();
|
|
25
|
+
CoreBPEWrapper::new(core_bpe)
|
|
26
|
+
}
|
|
27
|
+
|
|
23
28
|
fn module() -> Result<RModule, magnus::Error> {
|
|
24
29
|
define_module("Tiktoken")
|
|
25
30
|
}
|
|
@@ -37,6 +42,7 @@ fn init() -> Result<(), Error> {
|
|
|
37
42
|
factory_module.define_singleton_method("p50k_base", function!(p50k_base, 0))?;
|
|
38
43
|
factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
|
|
39
44
|
factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
|
|
45
|
+
factory_module.define_singleton_method("o200k_base", function!(o200k_base, 0))?;
|
|
40
46
|
|
|
41
47
|
let ext_module = module.define_module("Ext")?;
|
|
42
48
|
let bpe_class = ext_module.define_class("CoreBPE", class::object())?;
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
class Tiktoken::Encoding
|
|
4
|
+
CACHE_MUTEX = Mutex.new
|
|
5
|
+
|
|
4
6
|
attr_reader :name
|
|
5
7
|
|
|
6
8
|
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
|
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
|
|
|
15
17
|
# @param encoding [Symbol] The name of the encoding to load
|
|
16
18
|
# @return [Tiktoken::Encoding] The encoding instance
|
|
17
19
|
def self.for_name_cached(encoding)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
+
CACHE_MUTEX.synchronize do
|
|
21
|
+
@encodings ||= {}
|
|
22
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
|
23
|
+
end
|
|
20
24
|
end
|
|
21
25
|
|
|
22
26
|
# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
|
data/lib/tiktoken_ruby.rb
CHANGED
|
@@ -28,7 +28,7 @@ module Tiktoken
|
|
|
28
28
|
|
|
29
29
|
# Gets the encoding for an OpenAI model
|
|
30
30
|
# @param model_name [Symbol|String] The name of the model to get the encoding for
|
|
31
|
-
# @return [Tiktoken::Encoding] The encoding instance
|
|
31
|
+
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
|
|
32
32
|
# @example Count tokens for text
|
|
33
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
|
34
34
|
# enc.encode("hello world").length #=> 2
|
|
@@ -37,10 +37,12 @@ module Tiktoken
|
|
|
37
37
|
return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
MODEL_PREFIX_TO_ENCODING.
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
40
|
+
_prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
|
|
41
|
+
model_name.start_with?(prefix.to_s)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
if encoding
|
|
45
|
+
get_encoding(encoding)
|
|
44
46
|
end
|
|
45
47
|
end
|
|
46
48
|
|
|
@@ -62,13 +64,15 @@ module Tiktoken
|
|
|
62
64
|
:r50k_base,
|
|
63
65
|
:p50k_base,
|
|
64
66
|
:p50k_edit,
|
|
65
|
-
:cl100k_base
|
|
67
|
+
:cl100k_base,
|
|
68
|
+
:o200k_base
|
|
66
69
|
]
|
|
67
70
|
|
|
68
71
|
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
|
69
72
|
# that is also MIT licensed but by OpenAI
|
|
70
73
|
MODEL_TO_ENCODING_NAME = {
|
|
71
74
|
# chat
|
|
75
|
+
"gpt-4o": "o200k_base",
|
|
72
76
|
"gpt-4": "cl100k_base",
|
|
73
77
|
"gpt-3.5-turbo": "cl100k_base",
|
|
74
78
|
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
|
@@ -118,6 +122,7 @@ module Tiktoken
|
|
|
118
122
|
|
|
119
123
|
MODEL_PREFIX_TO_ENCODING = {
|
|
120
124
|
# chat
|
|
125
|
+
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13, etc.
|
|
121
126
|
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
|
122
127
|
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
|
123
128
|
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
data/tiktoken_ruby.gemspec
CHANGED
|
@@ -29,5 +29,5 @@ Gem::Specification.new do |spec|
|
|
|
29
29
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
30
30
|
spec.require_paths = ["lib"]
|
|
31
31
|
spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
|
|
32
|
-
spec.add_dependency "rb_sys", "
|
|
32
|
+
spec.add_dependency "rb_sys", "= 0.9.87"
|
|
33
33
|
end
|
metadata
CHANGED
|
@@ -1,29 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tiktoken_ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.9
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- IAPark
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-05-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- -
|
|
17
|
+
- - '='
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 0.9.
|
|
19
|
+
version: 0.9.87
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- -
|
|
24
|
+
- - '='
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 0.9.
|
|
26
|
+
version: 0.9.87
|
|
27
27
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
|
28
28
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
|
29
29
|
it to OpenAI APIs.
|