tiktoken_ruby 0.0.12 → 0.0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 894d85b8e99040cab3c06241e9d2b3d538c0245efed323c4225f30b7e286b530
4
- data.tar.gz: 382740d1eb3397908163411ccc2eb633f883d21c687a06073482be920986bd91
3
+ metadata.gz: 71cce652b2f6a2ca962823d1947603a5224f305901cb4d8c822ca32b58192d47
4
+ data.tar.gz: d2cd0525f5f784a5904e1b7bc05cb3dc0c00f1f10561d52a99c6ef12b1351e89
5
5
  SHA512:
6
- metadata.gz: 5c9020b8d1872979e3d5df3d8bf472f8282ada8a99efb5a22f8e95e2345d4b7b01f24ab5c7530336b8b64c835593418cc8a8a637fd6c07aa4814944a543fa69a
7
- data.tar.gz: 3310d1d4dbec01271d3cc84ad1b107d9da1206b4c1cbd7db27b4fe1a4c3b51c9d5dd8e443a9169262f09eba575d19c6f83bd0f402025bfa6bff84ca61532e7b8
6
+ metadata.gz: fe7572bf7a82f77441335a273e90e6b4bce92be2b6fb6073c1409de635af8c3521b0181ba8d3422691f8d5242b6a3b08ec899fad465084bb7fc570d153e44b00
7
+ data.tar.gz: 4f95fa2c39ed53c1d40cb6928f294f1617b35042abd4b8cdcf641dac016b4dba9a15b3f46835cd85600af8a73152c5f3488694db9cdf9d3dd22707ab109a133d
@@ -4,5 +4,8 @@
4
4
  },
5
5
  "[markdown]": {
6
6
  "editor.defaultFormatter": "esbenp.prettier-vscode"
7
+ },
8
+ "[github-actions-workflow]": {
9
+ "editor.defaultFormatter": "redhat.vscode-yaml"
7
10
  }
8
11
  }
data/CHANGELOG.md ADDED
@@ -0,0 +1,21 @@
1
+ # [v0.0.14.1] - 2025-12-20
2
+ ## What's Changed
3
+ * Cut v0.0.12 by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/78
4
+ * Bump magnus from 0.8.0 to 0.8.1 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/83
5
+ * Bump actions/checkout from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/82
6
+ * Bump standard from 1.50.0 to 1.51.1 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/81
7
+ * Bump actions/upload-artifact from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/84
8
+ * Bump magnus from 0.8.1 to 0.8.2 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/85
9
+ * Bump the bundler-dependencies group with 2 updates by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/86
10
+ * Support by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/87
11
+ * Bump actions/checkout from 5 to 6 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/89
12
+ * Bump standard from 1.51.1 to 1.52.0 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/88
13
+ * release GVL while encoding / decoding tokens by @tenderworks in https://github.com/IAPark/tiktoken_ruby/pull/90
14
+ * Drop Ruby 3.1 support; automate release process by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/92
15
+ * Rewrite history by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/93
16
+ * Force workflow rebuild by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/94
17
+
18
+ ## New Contributors
19
+ * @tenderworks made their first contribution in https://github.com/IAPark/tiktoken_ruby/pull/90
20
+
21
+ **Full Changelog**: https://github.com/IAPark/tiktoken_ruby/compare/v0.0.12...v0.0.14.1
data/Cargo.lock CHANGED
@@ -4,18 +4,18 @@ version = 4
4
4
 
5
5
  [[package]]
6
6
  name = "aho-corasick"
7
- version = "1.1.3"
7
+ version = "1.1.4"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
- checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
9
+ checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
10
10
  dependencies = [
11
11
  "memchr",
12
12
  ]
13
13
 
14
14
  [[package]]
15
15
  name = "anyhow"
16
- version = "1.0.99"
16
+ version = "1.0.100"
17
17
  source = "registry+https://github.com/rust-lang/crates.io-index"
18
- checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100"
18
+ checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
19
19
 
20
20
  [[package]]
21
21
  name = "base64"
@@ -60,15 +60,15 @@ checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
60
60
 
61
61
  [[package]]
62
62
  name = "bitflags"
63
- version = "2.9.3"
63
+ version = "2.10.0"
64
64
  source = "registry+https://github.com/rust-lang/crates.io-index"
65
- checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d"
65
+ checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
66
66
 
67
67
  [[package]]
68
68
  name = "bstr"
69
- version = "1.12.0"
69
+ version = "1.12.1"
70
70
  source = "registry+https://github.com/rust-lang/crates.io-index"
71
- checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4"
71
+ checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
72
72
  dependencies = [
73
73
  "memchr",
74
74
  "regex-automata",
@@ -86,9 +86,9 @@ dependencies = [
86
86
 
87
87
  [[package]]
88
88
  name = "cfg-if"
89
- version = "1.0.3"
89
+ version = "1.0.4"
90
90
  source = "registry+https://github.com/rust-lang/crates.io-index"
91
- checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
91
+ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
92
92
 
93
93
  [[package]]
94
94
  name = "clang-sys"
@@ -147,25 +147,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
147
147
 
148
148
  [[package]]
149
149
  name = "libc"
150
- version = "0.2.175"
150
+ version = "0.2.177"
151
151
  source = "registry+https://github.com/rust-lang/crates.io-index"
152
- checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543"
152
+ checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
153
153
 
154
154
  [[package]]
155
155
  name = "libloading"
156
- version = "0.8.8"
156
+ version = "0.8.9"
157
157
  source = "registry+https://github.com/rust-lang/crates.io-index"
158
- checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
158
+ checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
159
159
  dependencies = [
160
160
  "cfg-if",
161
- "windows-targets",
161
+ "windows-link",
162
162
  ]
163
163
 
164
164
  [[package]]
165
165
  name = "magnus"
166
- version = "0.8.0"
166
+ version = "0.8.2"
167
167
  source = "registry+https://github.com/rust-lang/crates.io-index"
168
- checksum = "3f14d3cc31b2dc4fce6cd447a83c7a7ca2ab8a9f1e535dcb2f796ff972b0e68b"
168
+ checksum = "3b36a5b126bbe97eb0d02d07acfeb327036c6319fd816139a49824a83b7f9012"
169
169
  dependencies = [
170
170
  "magnus-macros",
171
171
  "rb-sys",
@@ -186,9 +186,9 @@ dependencies = [
186
186
 
187
187
  [[package]]
188
188
  name = "memchr"
189
- version = "2.7.5"
189
+ version = "2.7.6"
190
190
  source = "registry+https://github.com/rust-lang/crates.io-index"
191
- checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
191
+ checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
192
192
 
193
193
  [[package]]
194
194
  name = "minimal-lexical"
@@ -208,18 +208,18 @@ dependencies = [
208
208
 
209
209
  [[package]]
210
210
  name = "proc-macro2"
211
- version = "1.0.101"
211
+ version = "1.0.103"
212
212
  source = "registry+https://github.com/rust-lang/crates.io-index"
213
- checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
213
+ checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
214
214
  dependencies = [
215
215
  "unicode-ident",
216
216
  ]
217
217
 
218
218
  [[package]]
219
219
  name = "quote"
220
- version = "1.0.40"
220
+ version = "1.0.42"
221
221
  source = "registry+https://github.com/rust-lang/crates.io-index"
222
- checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
222
+ checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
223
223
  dependencies = [
224
224
  "proc-macro2",
225
225
  ]
@@ -256,9 +256,9 @@ checksum = "08f8d2924cf136a1315e2b4c7460a39f62ef11ee5d522df9b2750fab55b868b6"
256
256
 
257
257
  [[package]]
258
258
  name = "regex"
259
- version = "1.11.2"
259
+ version = "1.12.2"
260
260
  source = "registry+https://github.com/rust-lang/crates.io-index"
261
- checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912"
261
+ checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
262
262
  dependencies = [
263
263
  "aho-corasick",
264
264
  "memchr",
@@ -268,9 +268,9 @@ dependencies = [
268
268
 
269
269
  [[package]]
270
270
  name = "regex-automata"
271
- version = "0.4.10"
271
+ version = "0.4.13"
272
272
  source = "registry+https://github.com/rust-lang/crates.io-index"
273
- checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6"
273
+ checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
274
274
  dependencies = [
275
275
  "aho-corasick",
276
276
  "memchr",
@@ -279,9 +279,9 @@ dependencies = [
279
279
 
280
280
  [[package]]
281
281
  name = "regex-syntax"
282
- version = "0.8.6"
282
+ version = "0.8.8"
283
283
  source = "registry+https://github.com/rust-lang/crates.io-index"
284
- checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001"
284
+ checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
285
285
 
286
286
  [[package]]
287
287
  name = "rustc-hash"
@@ -297,18 +297,27 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
297
297
 
298
298
  [[package]]
299
299
  name = "serde"
300
- version = "1.0.219"
300
+ version = "1.0.228"
301
+ source = "registry+https://github.com/rust-lang/crates.io-index"
302
+ checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
303
+ dependencies = [
304
+ "serde_core",
305
+ ]
306
+
307
+ [[package]]
308
+ name = "serde_core"
309
+ version = "1.0.228"
301
310
  source = "registry+https://github.com/rust-lang/crates.io-index"
302
- checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
311
+ checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
303
312
  dependencies = [
304
313
  "serde_derive",
305
314
  ]
306
315
 
307
316
  [[package]]
308
317
  name = "serde_derive"
309
- version = "1.0.219"
318
+ version = "1.0.228"
310
319
  source = "registry+https://github.com/rust-lang/crates.io-index"
311
- checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
320
+ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
312
321
  dependencies = [
313
322
  "proc-macro2",
314
323
  "quote",
@@ -329,9 +338,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
329
338
 
330
339
  [[package]]
331
340
  name = "syn"
332
- version = "2.0.106"
341
+ version = "2.0.110"
333
342
  source = "registry+https://github.com/rust-lang/crates.io-index"
334
- checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
343
+ checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea"
335
344
  dependencies = [
336
345
  "proc-macro2",
337
346
  "quote",
@@ -340,9 +349,9 @@ dependencies = [
340
349
 
341
350
  [[package]]
342
351
  name = "tiktoken-rs"
343
- version = "0.7.0"
352
+ version = "0.9.1"
344
353
  source = "registry+https://github.com/rust-lang/crates.io-index"
345
- checksum = "25563eeba904d770acf527e8b370fe9a5547bacd20ff84a0b6c3bc41288e5625"
354
+ checksum = "3a19830747d9034cd9da43a60eaa8e552dfda7712424aebf187b7a60126bae0d"
346
355
  dependencies = [
347
356
  "anyhow",
348
357
  "base64",
@@ -364,77 +373,12 @@ dependencies = [
364
373
 
365
374
  [[package]]
366
375
  name = "unicode-ident"
367
- version = "1.0.18"
376
+ version = "1.0.22"
368
377
  source = "registry+https://github.com/rust-lang/crates.io-index"
369
- checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
378
+ checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
370
379
 
371
380
  [[package]]
372
381
  name = "windows-link"
373
- version = "0.1.3"
374
- source = "registry+https://github.com/rust-lang/crates.io-index"
375
- checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
376
-
377
- [[package]]
378
- name = "windows-targets"
379
- version = "0.53.3"
380
- source = "registry+https://github.com/rust-lang/crates.io-index"
381
- checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91"
382
- dependencies = [
383
- "windows-link",
384
- "windows_aarch64_gnullvm",
385
- "windows_aarch64_msvc",
386
- "windows_i686_gnu",
387
- "windows_i686_gnullvm",
388
- "windows_i686_msvc",
389
- "windows_x86_64_gnu",
390
- "windows_x86_64_gnullvm",
391
- "windows_x86_64_msvc",
392
- ]
393
-
394
- [[package]]
395
- name = "windows_aarch64_gnullvm"
396
- version = "0.53.0"
397
- source = "registry+https://github.com/rust-lang/crates.io-index"
398
- checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
399
-
400
- [[package]]
401
- name = "windows_aarch64_msvc"
402
- version = "0.53.0"
403
- source = "registry+https://github.com/rust-lang/crates.io-index"
404
- checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
405
-
406
- [[package]]
407
- name = "windows_i686_gnu"
408
- version = "0.53.0"
409
- source = "registry+https://github.com/rust-lang/crates.io-index"
410
- checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
411
-
412
- [[package]]
413
- name = "windows_i686_gnullvm"
414
- version = "0.53.0"
415
- source = "registry+https://github.com/rust-lang/crates.io-index"
416
- checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
417
-
418
- [[package]]
419
- name = "windows_i686_msvc"
420
- version = "0.53.0"
421
- source = "registry+https://github.com/rust-lang/crates.io-index"
422
- checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
423
-
424
- [[package]]
425
- name = "windows_x86_64_gnu"
426
- version = "0.53.0"
427
- source = "registry+https://github.com/rust-lang/crates.io-index"
428
- checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
429
-
430
- [[package]]
431
- name = "windows_x86_64_gnullvm"
432
- version = "0.53.0"
433
- source = "registry+https://github.com/rust-lang/crates.io-index"
434
- checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
435
-
436
- [[package]]
437
- name = "windows_x86_64_msvc"
438
- version = "0.53.0"
382
+ version = "0.2.1"
439
383
  source = "registry+https://github.com/rust-lang/crates.io-index"
440
- checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
384
+ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.12)
4
+ tiktoken_ruby (0.0.14.1)
5
5
  rb_sys (~> 0.9)
6
6
 
7
7
  GEM
@@ -9,38 +9,38 @@ GEM
9
9
  specs:
10
10
  ast (2.4.3)
11
11
  diff-lcs (1.6.2)
12
- json (2.12.0)
12
+ json (2.16.0)
13
13
  language_server-protocol (3.17.0.5)
14
14
  lint_roller (1.1.0)
15
15
  minitest (5.25.5)
16
16
  parallel (1.27.0)
17
- parser (3.3.8.0)
17
+ parser (3.3.10.0)
18
18
  ast (~> 2.4.1)
19
19
  racc
20
- prism (1.4.0)
20
+ prism (1.6.0)
21
21
  racc (1.8.1)
22
22
  rainbow (3.1.1)
23
- rake (13.3.0)
23
+ rake (13.3.1)
24
24
  rake-compiler (1.3.0)
25
25
  rake
26
26
  rake-compiler-dock (1.9.1)
27
27
  rb_sys (0.9.117)
28
28
  rake-compiler-dock (= 1.9.1)
29
- regexp_parser (2.10.0)
30
- rspec (3.13.1)
29
+ regexp_parser (2.11.3)
30
+ rspec (3.13.2)
31
31
  rspec-core (~> 3.13.0)
32
32
  rspec-expectations (~> 3.13.0)
33
33
  rspec-mocks (~> 3.13.0)
34
- rspec-core (3.13.5)
34
+ rspec-core (3.13.6)
35
35
  rspec-support (~> 3.13.0)
36
36
  rspec-expectations (3.13.5)
37
37
  diff-lcs (>= 1.2.0, < 2.0)
38
38
  rspec-support (~> 3.13.0)
39
- rspec-mocks (3.13.5)
39
+ rspec-mocks (3.13.7)
40
40
  diff-lcs (>= 1.2.0, < 2.0)
41
41
  rspec-support (~> 3.13.0)
42
- rspec-support (3.13.5)
43
- rubocop (1.75.7)
42
+ rspec-support (3.13.6)
43
+ rubocop (1.81.7)
44
44
  json (~> 2.3)
45
45
  language_server-protocol (~> 3.17.0.2)
46
46
  lint_roller (~> 1.1.0)
@@ -48,10 +48,10 @@ GEM
48
48
  parser (>= 3.3.0.2)
49
49
  rainbow (>= 2.2.2, < 4.0)
50
50
  regexp_parser (>= 2.9.3, < 3.0)
51
- rubocop-ast (>= 1.44.0, < 2.0)
51
+ rubocop-ast (>= 1.47.1, < 2.0)
52
52
  ruby-progressbar (~> 1.7)
53
53
  unicode-display_width (>= 2.4.0, < 4.0)
54
- rubocop-ast (1.44.1)
54
+ rubocop-ast (1.48.0)
55
55
  parser (>= 3.3.7.2)
56
56
  prism (~> 1.4)
57
57
  rubocop-performance (1.25.0)
@@ -59,10 +59,10 @@ GEM
59
59
  rubocop (>= 1.75.0, < 2.0)
60
60
  rubocop-ast (>= 1.38.0, < 2.0)
61
61
  ruby-progressbar (1.13.0)
62
- standard (1.50.0)
62
+ standard (1.52.0)
63
63
  language_server-protocol (~> 3.17.0.2)
64
64
  lint_roller (~> 1.0)
65
- rubocop (~> 1.75.5)
65
+ rubocop (~> 1.81.7)
66
66
  standard-custom (~> 1.0.0)
67
67
  standard-performance (~> 1.8)
68
68
  standard-custom (1.0.2)
@@ -71,9 +71,9 @@ GEM
71
71
  standard-performance (1.8.0)
72
72
  lint_roller (~> 1.1)
73
73
  rubocop-performance (~> 1.25.0)
74
- unicode-display_width (3.1.4)
75
- unicode-emoji (~> 4.0, >= 4.0.4)
76
- unicode-emoji (4.0.4)
74
+ unicode-display_width (3.2.0)
75
+ unicode-emoji (~> 4.1)
76
+ unicode-emoji (4.1.0)
77
77
  yard (0.9.37)
78
78
  yard-doctest (0.1.17)
79
79
  minitest
data/README.md CHANGED
@@ -36,6 +36,39 @@ enc = Tiktoken.encoding_for_model("gpt-4")
36
36
  enc.encode("hello world").length #=> 2
37
37
  ```
38
38
 
39
+ ### Encoding methods
40
+
41
+ There are three methods for encoding text:
42
+
43
+ - `encode_ordinary(text)` - Encodes text, always treating special tokens as ordinary text
44
+ - `encode(text, allowed_special: [])` - Encodes text, treating special tokens as text unless listed in `allowed_special`
45
+ - `encode_with_special_tokens(text)` - Encodes text, recognizing and parsing all special tokens
46
+
47
+ **Special tokens** are control sequences used by OpenAI models, such as `<|endoftext|>`, `<|fim_prefix|>`, `<|fim_middle|>`, and `<|fim_suffix|>`. The encoding methods differ in how they handle these sequences:
48
+
49
+ ```ruby
50
+ enc = Tiktoken.get_encoding("cl100k_base")
51
+ text = "Hello<|endoftext|>World"
52
+
53
+ # encode_ordinary: treats <|endoftext|> as literal characters (9 tokens)
54
+ enc.encode_ordinary(text)
55
+ #=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
56
+
57
+ # encode: same as encode_ordinary when allowed_special is left empty (the default)
58
+ enc.encode(text)
59
+ #=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
60
+
61
+ # encode with allowed_special: recognizes the specified special token (3 tokens)
62
+ enc.encode(text, allowed_special: ["<|endoftext|>"])
63
+ #=> [9906, 100257, 10343]
64
+
65
+ # encode_with_special_tokens: recognizes ALL special tokens (3 tokens)
66
+ enc.encode_with_special_tokens(text)
67
+ #=> [9906, 100257, 10343]
68
+ ```
69
+
70
+ All methods round-trip correctly through `decode`.
71
+
39
72
  ## Development
40
73
 
41
74
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -10,6 +10,6 @@ publish = false
10
10
  crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
- magnus = { version = "0.8.0" }
13
+ magnus = { version = "0.8.2" }
14
14
  rb-sys = { version = "0.9.117", features = ["stable-api-compiled-fallback"] }
15
- tiktoken-rs = { version = "0.7.0" }
15
+ tiktoken-rs = { version = "0.9.0" }
@@ -1,4 +1,5 @@
1
1
  use std::collections::HashSet;
2
+ use std::ffi::c_void;
2
3
 
3
4
  use tiktoken_rs::Rank;
4
5
 
@@ -9,13 +10,82 @@ pub struct CoreBPEWrapper {
9
10
  core_bpe: tiktoken_rs::CoreBPE,
10
11
  }
11
12
 
13
+ struct EncodeOrdinaryData {
14
+ core_bpe: *const tiktoken_rs::CoreBPE,
15
+ text: String,
16
+ result: Vec<Rank>,
17
+ }
18
+
19
+ struct EncodeData {
20
+ core_bpe: *const tiktoken_rs::CoreBPE,
21
+ text: String,
22
+ allowed_special: HashSet<String>,
23
+ result: Vec<Rank>,
24
+ }
25
+
26
+ struct EncodeSpecialData {
27
+ core_bpe: *const tiktoken_rs::CoreBPE,
28
+ text: String,
29
+ result: Vec<Rank>,
30
+ }
31
+
32
+ struct DecodeData {
33
+ core_bpe: *const tiktoken_rs::CoreBPE,
34
+ ids: Vec<Rank>,
35
+ result: Result<String, String>,
36
+ }
37
+
38
+ unsafe extern "C" fn encode_ordinary_without_gvl(data: *mut c_void) -> *mut c_void {
39
+ let data = &mut *(data as *mut EncodeOrdinaryData);
40
+ let core_bpe = &*data.core_bpe;
41
+ data.result = core_bpe.encode_ordinary(&data.text);
42
+ std::ptr::null_mut()
43
+ }
44
+
45
+ unsafe extern "C" fn encode_without_gvl(data: *mut c_void) -> *mut c_void {
46
+ let data = &mut *(data as *mut EncodeData);
47
+ let core_bpe = &*data.core_bpe;
48
+ let allowed_special: HashSet<&str> = data.allowed_special.iter().map(|s| s.as_str()).collect();
49
+ data.result = core_bpe.encode(&data.text, &allowed_special).0;
50
+ std::ptr::null_mut()
51
+ }
52
+
53
+ unsafe extern "C" fn encode_special_without_gvl(data: *mut c_void) -> *mut c_void {
54
+ let data = &mut *(data as *mut EncodeSpecialData);
55
+ let core_bpe = &*data.core_bpe;
56
+ data.result = core_bpe.encode_with_special_tokens(&data.text);
57
+ std::ptr::null_mut()
58
+ }
59
+
60
+ unsafe extern "C" fn decode_without_gvl(data: *mut c_void) -> *mut c_void {
61
+ let data = &mut *(data as *mut DecodeData);
62
+ let core_bpe = &*data.core_bpe;
63
+ data.result = core_bpe.decode(std::mem::take(&mut data.ids)).map_err(|e| e.to_string());
64
+ std::ptr::null_mut()
65
+ }
66
+
12
67
  impl CoreBPEWrapper {
13
68
  pub fn new(core_bpe: tiktoken_rs::CoreBPE) -> Self {
14
69
  Self { core_bpe }
15
70
  }
16
71
 
17
72
  pub fn encode_ordinary(&self, text: String) -> Vec<Rank> {
18
- self.core_bpe.encode_ordinary(text.as_str())
73
+ let mut data = EncodeOrdinaryData {
74
+ core_bpe: &self.core_bpe as *const _,
75
+ text,
76
+ result: Vec::new(),
77
+ };
78
+
79
+ unsafe {
80
+ rb_sys::rb_thread_call_without_gvl(
81
+ Some(encode_ordinary_without_gvl),
82
+ &mut data as *mut _ as *mut c_void,
83
+ None,
84
+ std::ptr::null_mut(),
85
+ );
86
+ }
87
+
88
+ data.result
19
89
  }
20
90
 
21
91
  pub fn encode(
@@ -24,24 +94,68 @@ impl CoreBPEWrapper {
24
94
  allowed_special: magnus::RArray,
25
95
  ) -> Result<Vec<Rank>, magnus::Error> {
26
96
  let allowed_special: Vec<String> = allowed_special.to_vec()?;
27
- let allowed_special: Vec<&str> = allowed_special.iter().map(|s| s.as_str()).collect();
28
- let allowed_special: HashSet<&str> = HashSet::from_iter(allowed_special.iter().cloned());
29
97
 
30
- Ok(self.core_bpe.encode(text.as_str(), &allowed_special).0)
98
+ let mut data = EncodeData {
99
+ core_bpe: &self.core_bpe as *const _,
100
+ text,
101
+ allowed_special: HashSet::from_iter(allowed_special),
102
+ result: Vec::new(),
103
+ };
104
+
105
+ unsafe {
106
+ rb_sys::rb_thread_call_without_gvl(
107
+ Some(encode_without_gvl),
108
+ &mut data as *mut _ as *mut c_void,
109
+ None,
110
+ std::ptr::null_mut(),
111
+ );
112
+ }
113
+
114
+ Ok(data.result)
31
115
  }
32
116
 
33
117
  pub fn encode_with_special_tokens(&self, text: String) -> Vec<Rank> {
34
- self.core_bpe.encode_with_special_tokens(text.as_str())
118
+ let mut data = EncodeSpecialData {
119
+ core_bpe: &self.core_bpe as *const _,
120
+ text,
121
+ result: Vec::new(),
122
+ };
123
+
124
+ unsafe {
125
+ rb_sys::rb_thread_call_without_gvl(
126
+ Some(encode_special_without_gvl),
127
+ &mut data as *mut _ as *mut c_void,
128
+ None,
129
+ std::ptr::null_mut(),
130
+ );
131
+ }
132
+
133
+ data.result
35
134
  }
36
135
 
37
136
  pub fn decode(&self, ids: Vec<Rank>) -> Result<String, magnus::Error> {
38
- self.core_bpe.decode(ids).map_err(|e| {
137
+ let mut data = DecodeData {
138
+ core_bpe: &self.core_bpe as *const _,
139
+ ids,
140
+ result: Err(String::new()),
141
+ };
142
+
143
+ unsafe {
144
+ rb_sys::rb_thread_call_without_gvl(
145
+ Some(decode_without_gvl),
146
+ &mut data as *mut _ as *mut c_void,
147
+ None,
148
+ std::ptr::null_mut(),
149
+ );
150
+ }
151
+
152
+ data.result.map_err(|e| {
39
153
  let error = match uncicode_error() {
40
154
  Ok(error) => error,
41
155
  Err(e) => return e,
42
156
  };
43
157
 
44
- magnus::Error::new(error, e.to_string())
158
+ magnus::Error::new(error, e)
45
159
  })
46
160
  }
47
161
  }
@@ -25,12 +25,20 @@ fn o200k_base() -> CoreBPEWrapper {
25
25
  CoreBPEWrapper::new(core_bpe)
26
26
  }
27
27
 
28
+ fn o200k_harmony() -> CoreBPEWrapper {
29
+ let core_bpe = tiktoken_rs::o200k_harmony().unwrap();
30
+ CoreBPEWrapper::new(core_bpe)
31
+ }
32
+
28
33
  fn module() -> Result<RModule, magnus::Error> {
29
34
  Ruby::get().unwrap().define_module("Tiktoken")
30
35
  }
31
36
 
32
37
  fn uncicode_error() -> Result<ExceptionClass, magnus::Error> {
33
- module()?.define_error("UnicodeError", Ruby::get().unwrap().exception_standard_error())
38
+ module()?.define_error(
39
+ "UnicodeError",
40
+ Ruby::get().unwrap().exception_standard_error(),
41
+ )
34
42
  }
35
43
 
36
44
  #[magnus::init]
@@ -43,6 +51,7 @@ fn init() -> Result<(), Error> {
43
51
  factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
44
52
  factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
45
53
  factory_module.define_singleton_method("o200k_base", function!(o200k_base, 0))?;
54
+ factory_module.define_singleton_method("o200k_harmony", function!(o200k_harmony, 0))?;
46
55
 
47
56
  let ext_module = module.define_module("Ext")?;
48
57
  let bpe_class = ext_module.define_class("CoreBPE", Ruby::get().unwrap().class_object())?;
@@ -40,6 +40,13 @@ class Tiktoken::Encoding
40
40
  @ext_base_bpe.encode(text, allowed_special)
41
41
  end
42
42
 
43
+ # Encodes the text as a list of integer tokens, recognizing and parsing all special tokens.
44
+ # @param text [String] The text to encode
45
+ # @return [Array<Integer>] The encoded tokens
46
+ def encode_with_special_tokens(text)
47
+ @ext_base_bpe.encode_with_special_tokens(text)
48
+ end
49
+
43
50
  # Decodes the tokens back into text
44
51
  # @param tokens [Array<Integer>] The tokens to decode
45
52
  # @return [String] The decoded text
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.12"
4
+ VERSION = "0.0.14.1"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -65,7 +65,8 @@ module Tiktoken
65
65
  :p50k_base,
66
66
  :p50k_edit,
67
67
  :cl100k_base,
68
- :o200k_base
68
+ :o200k_base,
69
+ :o200k_harmony
69
70
  ]
70
71
 
71
72
  # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
@@ -143,6 +144,7 @@ module Tiktoken
143
144
  "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
144
145
  "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
145
146
  "gpt-35-turbo-": "cl100k_base", # Azure deployment name
147
+ "gpt-oss-": "o200k_harmony",
146
148
  # fine-tuned
147
149
  "ft:gpt-4o": "cl100k_base",
148
150
  "ft:gpt-4": "cl100k_base",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.14.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
@@ -36,6 +36,7 @@ files:
36
36
  - ".rspec"
37
37
  - ".standard.yml"
38
38
  - ".vscode/settings.json"
39
+ - CHANGELOG.md
39
40
  - Cargo.lock
40
41
  - Cargo.toml
41
42
  - Gemfile
@@ -51,9 +52,7 @@ files:
51
52
  - lib/tiktoken_ruby.rb
52
53
  - lib/tiktoken_ruby/encoding.rb
53
54
  - lib/tiktoken_ruby/version.rb
54
- - script/release
55
55
  - sig/tiktoken_ruby.rbs
56
- - tiktoken_ruby.gemspec
57
56
  homepage: https://github.com/IAPark/tiktoken_ruby
58
57
  licenses:
59
58
  - MIT
@@ -68,7 +67,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
67
  requirements:
69
68
  - - ">="
70
69
  - !ruby/object:Gem::Version
71
- version: 3.1.0
70
+ version: 3.2.0
72
71
  required_rubygems_version: !ruby/object:Gem::Requirement
73
72
  requirements:
74
73
  - - ">="
data/script/release DELETED
@@ -1,43 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- set -e
4
-
5
- if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then
6
- echo "Error: TIKTOKEN_PUBLISH_KEY is not set. This is the RubyGems API key to push the gem."
7
- exit 1
8
- fi
9
-
10
- run_id=""
11
- # Parse arguments
12
- while [[ "$#" -gt 0 ]]; do
13
- case $1 in
14
- --run-id)
15
- run_id="$2"
16
- shift 2
17
- ;;
18
- *)
19
- echo "Unknown parameter passed: $1"
20
- exit 1
21
- ;;
22
- esac
23
- done
24
-
25
- if [ -z "${run_id}" ]; then
26
- echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow."
27
- exit 1
28
- fi
29
-
30
- version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2)
31
- echo "Building tiktoken_ruby v$version, using artifacts from run $run_id"
32
-
33
- rm -rf pkg/cross-compiled
34
- gh run download "$run_id" -D pkg/cross-compiled
35
-
36
- for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do
37
- echo "Publishing $gem"
38
- GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
39
- done
40
-
41
- # last but not least, the uncompiled gem
42
- bundle exec rake package
43
- GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "pkg/tiktoken_ruby-$version.gem" --host https://rubygems.org
@@ -1,33 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "lib/tiktoken_ruby/version"
4
-
5
- Gem::Specification.new do |spec|
6
- spec.name = "tiktoken_ruby"
7
- spec.version = Tiktoken::VERSION
8
- spec.authors = ["IAPark"]
9
- spec.email = ["isaac.a.park@gmail.com"]
10
- spec.summary = "Ruby wrapper for Tiktoken"
11
- spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
12
- "a BPE tokenizer written by and used by OpenAI. It can be used to " \
13
- "count the number of tokens in text before sending it to OpenAI APIs."
14
- spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
15
- spec.license = "MIT"
16
- spec.required_ruby_version = ">= 3.1.0"
17
- spec.required_rubygems_version = ">= 3.4.0"
18
- spec.platform = Gem::Platform::RUBY
19
-
20
- spec.metadata["homepage_uri"] = spec.homepage
21
- spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
22
- spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
23
- spec.files = Dir.chdir(__dir__) do
24
- `git ls-files -z`.split("\x0").reject do |f|
25
- (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
26
- end
27
- end
28
- spec.bindir = "exe"
29
- spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
30
- spec.require_paths = ["lib"]
31
- spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
32
- spec.add_dependency "rb_sys", "~> 0.9"
33
- end