tiktoken_ruby 0.0.13-aarch64-linux → 0.0.14.1-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vscode/settings.json +3 -0
- data/CHANGELOG.md +21 -0
- data/Gemfile.lock +9 -9
- data/README.md +33 -0
- data/lib/tiktoken_ruby/3.2/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/3.3/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/3.4/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/encoding.rb +7 -0
- data/lib/tiktoken_ruby/version.rb +1 -1
- metadata +3 -3
- data/script/release +0 -43
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 77ba02e1cebd5f3cbec19aa88c925230122b297a21b8ed0e1d8c2beaac85670c
|
|
4
|
+
data.tar.gz: dfa74e11789b7b76da6078420aeffcb39216cd98cff16acf198be0933d77a838
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d0feaca501ad29cf0d57fbe6223f1f9a7a2241c80c060a48965397bca0459f69695643f2011e1e77ea292fea5b73a44b88841ff64e5c9239fc9d1a6ae0f1e0f8
|
|
7
|
+
data.tar.gz: af2c4d449d9ba77e889ae3bfb227f6ef2d183398995ea40b9e79838aeb06ab58a6a75f80ca5c842ea29e195a633495239037d40f8632857603eb48d101df46fa
|
data/.vscode/settings.json
CHANGED
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# [v0.0.14.1] - 20-12-2025
|
|
2
|
+
## What's Changed
|
|
3
|
+
* Cut v0.0.12 by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/78
|
|
4
|
+
* Bump magnus from 0.8.0 to 0.8.1 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/83
|
|
5
|
+
* Bump actions/checkout from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/82
|
|
6
|
+
* Bump standard from 1.50.0 to 1.51.1 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/81
|
|
7
|
+
* Bump actions/upload-artifact from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/84
|
|
8
|
+
* Bump magnus from 0.8.1 to 0.8.2 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/85
|
|
9
|
+
* Bump the bundler-dependencies group with 2 updates by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/86
|
|
10
|
+
* Support by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/87
|
|
11
|
+
* Bump actions/checkout from 5 to 6 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/89
|
|
12
|
+
* Bump standard from 1.51.1 to 1.52.0 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/88
|
|
13
|
+
* release GVL while encoding / decoding tokens by @tenderworks in https://github.com/IAPark/tiktoken_ruby/pull/90
|
|
14
|
+
* Drop Ruby 3.1 support; automate release process by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/92
|
|
15
|
+
* Rewrite history by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/93
|
|
16
|
+
* Force workflow rebuild by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/94
|
|
17
|
+
|
|
18
|
+
## New Contributors
|
|
19
|
+
* @tenderworks made their first contribution in https://github.com/IAPark/tiktoken_ruby/pull/90
|
|
20
|
+
|
|
21
|
+
**Full Changelog**: https://github.com/IAPark/tiktoken_ruby/compare/v0.0.12...v0.0.14.1
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
tiktoken_ruby (0.0.13)
|
|
4
|
+
tiktoken_ruby (0.0.14.1)
|
|
5
5
|
rb_sys (~> 0.9)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -9,15 +9,15 @@ GEM
|
|
|
9
9
|
specs:
|
|
10
10
|
ast (2.4.3)
|
|
11
11
|
diff-lcs (1.6.2)
|
|
12
|
-
json (2.
|
|
12
|
+
json (2.16.0)
|
|
13
13
|
language_server-protocol (3.17.0.5)
|
|
14
14
|
lint_roller (1.1.0)
|
|
15
15
|
minitest (5.25.5)
|
|
16
16
|
parallel (1.27.0)
|
|
17
|
-
parser (3.3.
|
|
17
|
+
parser (3.3.10.0)
|
|
18
18
|
ast (~> 2.4.1)
|
|
19
19
|
racc
|
|
20
|
-
prism (1.
|
|
20
|
+
prism (1.6.0)
|
|
21
21
|
racc (1.8.1)
|
|
22
22
|
rainbow (3.1.1)
|
|
23
23
|
rake (13.3.1)
|
|
@@ -40,7 +40,7 @@ GEM
|
|
|
40
40
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
41
41
|
rspec-support (~> 3.13.0)
|
|
42
42
|
rspec-support (3.13.6)
|
|
43
|
-
rubocop (1.
|
|
43
|
+
rubocop (1.81.7)
|
|
44
44
|
json (~> 2.3)
|
|
45
45
|
language_server-protocol (~> 3.17.0.2)
|
|
46
46
|
lint_roller (~> 1.1.0)
|
|
@@ -48,10 +48,10 @@ GEM
|
|
|
48
48
|
parser (>= 3.3.0.2)
|
|
49
49
|
rainbow (>= 2.2.2, < 4.0)
|
|
50
50
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
51
|
-
rubocop-ast (>= 1.
|
|
51
|
+
rubocop-ast (>= 1.47.1, < 2.0)
|
|
52
52
|
ruby-progressbar (~> 1.7)
|
|
53
53
|
unicode-display_width (>= 2.4.0, < 4.0)
|
|
54
|
-
rubocop-ast (1.
|
|
54
|
+
rubocop-ast (1.48.0)
|
|
55
55
|
parser (>= 3.3.7.2)
|
|
56
56
|
prism (~> 1.4)
|
|
57
57
|
rubocop-performance (1.25.0)
|
|
@@ -59,10 +59,10 @@ GEM
|
|
|
59
59
|
rubocop (>= 1.75.0, < 2.0)
|
|
60
60
|
rubocop-ast (>= 1.38.0, < 2.0)
|
|
61
61
|
ruby-progressbar (1.13.0)
|
|
62
|
-
standard (1.
|
|
62
|
+
standard (1.52.0)
|
|
63
63
|
language_server-protocol (~> 3.17.0.2)
|
|
64
64
|
lint_roller (~> 1.0)
|
|
65
|
-
rubocop (~> 1.
|
|
65
|
+
rubocop (~> 1.81.7)
|
|
66
66
|
standard-custom (~> 1.0.0)
|
|
67
67
|
standard-performance (~> 1.8)
|
|
68
68
|
standard-custom (1.0.2)
|
data/README.md
CHANGED
|
@@ -36,6 +36,39 @@ enc = Tiktoken.encoding_for_model("gpt-4")
|
|
|
36
36
|
enc.encode("hello world").length #=> 2
|
|
37
37
|
```
|
|
38
38
|
|
|
39
|
+
### Encoding methods
|
|
40
|
+
|
|
41
|
+
There are three methods for encoding text:
|
|
42
|
+
|
|
43
|
+
- `encode_ordinary(text)` - Encodes text, always treating special tokens as ordinary text
|
|
44
|
+
- `encode(text, allowed_special: [])` - Encodes text, treating special tokens as text unless listed in `allowed_special`
|
|
45
|
+
- `encode_with_special_tokens(text)` - Encodes text, recognizing and parsing all special tokens
|
|
46
|
+
|
|
47
|
+
**Special tokens** are control sequences used by OpenAI models, such as `<|endoftext|>`, `<|fim_prefix|>`, `<|fim_middle|>`, and `<|fim_suffix|>`. The encoding methods differ in how they handle these sequences:
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
enc = Tiktoken.get_encoding("cl100k_base")
|
|
51
|
+
text = "Hello<|endoftext|>World"
|
|
52
|
+
|
|
53
|
+
# encode_ordinary: treats <|endoftext|> as literal characters (9 tokens)
|
|
54
|
+
enc.encode_ordinary(text)
|
|
55
|
+
#=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
|
|
56
|
+
|
|
57
|
+
# encode: same as encode_ordinary by default
|
|
58
|
+
enc.encode(text)
|
|
59
|
+
#=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
|
|
60
|
+
|
|
61
|
+
# encode with allowed_special: recognizes the specified special token (3 tokens)
|
|
62
|
+
enc.encode(text, allowed_special: ["<|endoftext|>"])
|
|
63
|
+
#=> [9906, 100257, 10343]
|
|
64
|
+
|
|
65
|
+
# encode_with_special_tokens: recognizes ALL special tokens (3 tokens)
|
|
66
|
+
enc.encode_with_special_tokens(text)
|
|
67
|
+
#=> [9906, 100257, 10343]
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
All methods round-trip correctly through `decode`.
|
|
71
|
+
|
|
39
72
|
## Development
|
|
40
73
|
|
|
41
74
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
data/lib/tiktoken_ruby/encoding.rb
CHANGED
|
@@ -40,6 +40,13 @@ class Tiktoken::Encoding
|
|
|
40
40
|
@ext_base_bpe.encode(text, allowed_special)
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
# Encodes the text as a list of integer tokens, including special tokens.
|
|
44
|
+
# @param text [String] The text to encode
|
|
45
|
+
# @return [Array<Integer>] The encoded tokens
|
|
46
|
+
def encode_with_special_tokens(text)
|
|
47
|
+
@ext_base_bpe.encode_with_special_tokens(text)
|
|
48
|
+
end
|
|
49
|
+
|
|
43
50
|
# Decodes the tokens back into text
|
|
44
51
|
# @param tokens [Array<Integer>] The tokens to decode
|
|
45
52
|
# @return [String] The decoded text
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tiktoken_ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.13
|
|
4
|
+
version: 0.0.14.1
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- IAPark
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-12-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
|
14
14
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
|
@@ -22,6 +22,7 @@ files:
|
|
|
22
22
|
- ".rspec"
|
|
23
23
|
- ".standard.yml"
|
|
24
24
|
- ".vscode/settings.json"
|
|
25
|
+
- CHANGELOG.md
|
|
25
26
|
- Gemfile
|
|
26
27
|
- Gemfile.lock
|
|
27
28
|
- LICENSE.txt
|
|
@@ -34,7 +35,6 @@ files:
|
|
|
34
35
|
- lib/tiktoken_ruby/3.4/tiktoken_ruby.so
|
|
35
36
|
- lib/tiktoken_ruby/encoding.rb
|
|
36
37
|
- lib/tiktoken_ruby/version.rb
|
|
37
|
-
- script/release
|
|
38
38
|
- sig/tiktoken_ruby.rbs
|
|
39
39
|
homepage: https://github.com/IAPark/tiktoken_ruby
|
|
40
40
|
licenses:
|
data/script/release
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
|
|
3
|
-
set -e
|
|
4
|
-
|
|
5
|
-
if [ -z "${TIKTOKEN_PUBLISH_KEY}" ]; then
|
|
6
|
-
echo "Error: TIKTOKEN_PUBLISH_KEY is not set. This is the RubyGems API key to push the gem."
|
|
7
|
-
exit 1
|
|
8
|
-
fi
|
|
9
|
-
|
|
10
|
-
run_id=""
|
|
11
|
-
# Parse arguments
|
|
12
|
-
while [[ "$#" -gt 0 ]]; do
|
|
13
|
-
case $1 in
|
|
14
|
-
--run-id)
|
|
15
|
-
run_id="$2"
|
|
16
|
-
shift 2
|
|
17
|
-
;;
|
|
18
|
-
*)
|
|
19
|
-
echo "Unknown parameter passed: $1"
|
|
20
|
-
exit 1
|
|
21
|
-
;;
|
|
22
|
-
esac
|
|
23
|
-
done
|
|
24
|
-
|
|
25
|
-
if [ -z "${run_id}" ]; then
|
|
26
|
-
echo "Error: --run-id is not provided. Please provide the GitHub Action run id for the cross-compile workflow."
|
|
27
|
-
exit 1
|
|
28
|
-
fi
|
|
29
|
-
|
|
30
|
-
version=$(grep VERSION lib/tiktoken_ruby/version.rb | head -n 1 | cut -d'"' -f2)
|
|
31
|
-
echo "Building tiktoken_ruby v$version, using artifacts from run $run_id"
|
|
32
|
-
|
|
33
|
-
rm -rf pkg/cross-compiled
|
|
34
|
-
gh run download "$run_id" -D pkg/cross-compiled
|
|
35
|
-
|
|
36
|
-
for gem in pkg/cross-compiled/cross-gem-*/tiktoken_ruby-"$version"*.gem ; do
|
|
37
|
-
echo "Publishing $gem"
|
|
38
|
-
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "$gem" --host https://rubygems.org
|
|
39
|
-
done
|
|
40
|
-
|
|
41
|
-
# last but not least, the uncompiled gem
|
|
42
|
-
bundle exec rake package
|
|
43
|
-
GEM_HOST_API_KEY="${TIKTOKEN_PUBLISH_KEY}" gem push "pkg/tiktoken_ruby-$version.gem" --host https://rubygems.org
|