tiktoken_ruby 0.0.6-arm64-darwin → 0.0.7-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -9
- data/Gemfile.lock +44 -33
- data/lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/3.1/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/3.2/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/3.3/tiktoken_ruby.bundle +0 -0
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +36 -19
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 97f483266d9dd175af4d270716a3481a157db746d4be804504d20870c11e4fe0
|
4
|
+
data.tar.gz: 76468fd689d5b71bdaaaee7cc6141d51fe5a7ff2af70a334d1bd538a7b3e8455
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3501683955f9c817c90ba800ec7a4bb160b33bd3e40aa5efd2a0e482812d76a6fb3f0eac7e7aed3b02d5b72701b353090ec199f8b2929c0d77d8ac28b20462f2
|
7
|
+
data.tar.gz: 3ecc8a2f8fbdb30c3b0e8dcf54e1a4c6458cf994d2d86dc64c03264d81c04444968da3659ba07ca499d8ba113915c171553df147d3b98086f88325ab01d24265
|
data/Gemfile
CHANGED
@@ -2,15 +2,11 @@
|
|
2
2
|
|
3
3
|
source "https://rubygems.org"
|
4
4
|
|
5
|
-
# Specify your gem's dependencies in tiktoken_ruby.gemspec
|
6
5
|
gemspec
|
7
6
|
|
8
|
-
gem "rake"
|
9
|
-
|
7
|
+
gem "rake"
|
10
8
|
gem "rake-compiler"
|
11
|
-
|
12
|
-
gem "
|
13
|
-
|
14
|
-
gem "
|
15
|
-
|
16
|
-
gem "yard-doctest", "~> 0.1.17"
|
9
|
+
gem "rspec"
|
10
|
+
gem "standard"
|
11
|
+
gem "yard-doctest"
|
12
|
+
gem "racc"
|
data/Gemfile.lock
CHANGED
@@ -1,64 +1,74 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tiktoken_ruby (0.0.
|
5
|
-
rb_sys (
|
4
|
+
tiktoken_ruby (0.0.7)
|
5
|
+
rb_sys (>= 0.9.86)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
10
|
ast (2.4.2)
|
11
11
|
diff-lcs (1.5.0)
|
12
|
-
json (2.
|
12
|
+
json (2.7.1)
|
13
13
|
language_server-protocol (3.17.0.3)
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
lint_roller (1.1.0)
|
15
|
+
minitest (5.21.2)
|
16
|
+
parallel (1.24.0)
|
17
|
+
parser (3.3.0.4)
|
17
18
|
ast (~> 2.4.1)
|
19
|
+
racc
|
20
|
+
racc (1.7.3)
|
18
21
|
rainbow (3.1.1)
|
19
|
-
rake (13.0
|
20
|
-
rake-compiler (1.2.
|
22
|
+
rake (13.1.0)
|
23
|
+
rake-compiler (1.2.5)
|
21
24
|
rake
|
22
|
-
rb_sys (0.9.
|
23
|
-
regexp_parser (2.
|
24
|
-
rexml (3.2.
|
25
|
+
rb_sys (0.9.86)
|
26
|
+
regexp_parser (2.9.0)
|
27
|
+
rexml (3.2.6)
|
25
28
|
rspec (3.12.0)
|
26
29
|
rspec-core (~> 3.12.0)
|
27
30
|
rspec-expectations (~> 3.12.0)
|
28
31
|
rspec-mocks (~> 3.12.0)
|
29
|
-
rspec-core (3.12.
|
32
|
+
rspec-core (3.12.2)
|
30
33
|
rspec-support (~> 3.12.0)
|
31
|
-
rspec-expectations (3.12.
|
34
|
+
rspec-expectations (3.12.3)
|
32
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
33
36
|
rspec-support (~> 3.12.0)
|
34
|
-
rspec-mocks (3.12.
|
37
|
+
rspec-mocks (3.12.6)
|
35
38
|
diff-lcs (>= 1.2.0, < 2.0)
|
36
39
|
rspec-support (~> 3.12.0)
|
37
|
-
rspec-support (3.12.
|
38
|
-
rubocop (1.
|
40
|
+
rspec-support (3.12.1)
|
41
|
+
rubocop (1.59.0)
|
39
42
|
json (~> 2.3)
|
43
|
+
language_server-protocol (>= 3.17.0)
|
40
44
|
parallel (~> 1.10)
|
41
|
-
parser (>= 3.2.
|
45
|
+
parser (>= 3.2.2.4)
|
42
46
|
rainbow (>= 2.2.2, < 4.0)
|
43
47
|
regexp_parser (>= 1.8, < 3.0)
|
44
48
|
rexml (>= 3.2.5, < 4.0)
|
45
|
-
rubocop-ast (>= 1.
|
49
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
46
50
|
ruby-progressbar (~> 1.7)
|
47
51
|
unicode-display_width (>= 2.4.0, < 3.0)
|
48
|
-
rubocop-ast (1.
|
52
|
+
rubocop-ast (1.30.0)
|
49
53
|
parser (>= 3.2.1.0)
|
50
|
-
rubocop-performance (1.
|
51
|
-
rubocop (>= 1.
|
52
|
-
rubocop-ast (>=
|
54
|
+
rubocop-performance (1.20.2)
|
55
|
+
rubocop (>= 1.48.1, < 2.0)
|
56
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
53
57
|
ruby-progressbar (1.13.0)
|
54
|
-
standard (1.
|
58
|
+
standard (1.33.0)
|
55
59
|
language_server-protocol (~> 3.17.0.2)
|
56
|
-
|
57
|
-
rubocop
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
60
|
+
lint_roller (~> 1.0)
|
61
|
+
rubocop (~> 1.59.0)
|
62
|
+
standard-custom (~> 1.0.0)
|
63
|
+
standard-performance (~> 1.3)
|
64
|
+
standard-custom (1.0.2)
|
65
|
+
lint_roller (~> 1.0)
|
66
|
+
rubocop (~> 1.50)
|
67
|
+
standard-performance (1.3.1)
|
68
|
+
lint_roller (~> 1.1)
|
69
|
+
rubocop-performance (~> 1.20.2)
|
70
|
+
unicode-display_width (2.5.0)
|
71
|
+
yard (0.9.34)
|
62
72
|
yard-doctest (0.1.17)
|
63
73
|
minitest
|
64
74
|
yard
|
@@ -70,12 +80,13 @@ PLATFORMS
|
|
70
80
|
x86_64-linux
|
71
81
|
|
72
82
|
DEPENDENCIES
|
73
|
-
|
83
|
+
racc
|
84
|
+
rake
|
74
85
|
rake-compiler
|
75
|
-
rspec
|
76
|
-
standard
|
86
|
+
rspec
|
87
|
+
standard
|
77
88
|
tiktoken_ruby!
|
78
|
-
yard-doctest
|
89
|
+
yard-doctest
|
79
90
|
|
80
91
|
BUNDLED WITH
|
81
92
|
2.4.6
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tiktoken_ruby.rb
CHANGED
@@ -33,17 +33,15 @@ module Tiktoken
|
|
33
33
|
# enc = Tiktoken.encoding_for_model("gpt-4")
|
34
34
|
# enc.encode("hello world").length #=> 2
|
35
35
|
def encoding_for_model(model_name)
|
36
|
-
|
37
|
-
|
38
|
-
model_name = prefix
|
39
|
-
break
|
40
|
-
end
|
36
|
+
if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
|
37
|
+
return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
|
41
38
|
end
|
42
39
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
40
|
+
MODEL_PREFIX_TO_ENCODING.each do |prefix, encoding|
|
41
|
+
if model_name.start_with?(prefix.to_s)
|
42
|
+
return get_encoding(encoding)
|
43
|
+
end
|
44
|
+
end
|
47
45
|
end
|
48
46
|
|
49
47
|
# Lists all the encodings that are supported
|
@@ -67,12 +65,22 @@ module Tiktoken
|
|
67
65
|
:cl100k_base
|
68
66
|
]
|
69
67
|
|
70
|
-
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
68
|
+
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
71
69
|
# that is also MIT licensed but by OpenAI
|
72
70
|
MODEL_TO_ENCODING_NAME = {
|
71
|
+
# chat
|
73
72
|
"gpt-4": "cl100k_base",
|
74
73
|
"gpt-3.5-turbo": "cl100k_base",
|
75
|
-
#
|
74
|
+
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
75
|
+
# base
|
76
|
+
"davinci-002": "cl100k_base",
|
77
|
+
"babbage-002": "cl100k_base",
|
78
|
+
# embeddings
|
79
|
+
"text-embedding-ada-002": "cl100k_base",
|
80
|
+
"text-embedding-3-small": "cl100k_base",
|
81
|
+
"text-embedding-3-large": "cl100k_base",
|
82
|
+
# DEPRECATED MODELS
|
83
|
+
# text (DEPRECATED)
|
76
84
|
"text-davinci-003": "p50k_base",
|
77
85
|
"text-davinci-002": "p50k_base",
|
78
86
|
"text-davinci-001": "r50k_base",
|
@@ -83,19 +91,17 @@ module Tiktoken
|
|
83
91
|
curie: "r50k_base",
|
84
92
|
babbage: "r50k_base",
|
85
93
|
ada: "r50k_base",
|
86
|
-
# code
|
94
|
+
# code (DEPRECATED)
|
87
95
|
"code-davinci-002": "p50k_base",
|
88
96
|
"code-davinci-001": "p50k_base",
|
89
97
|
"code-cushman-002": "p50k_base",
|
90
98
|
"code-cushman-001": "p50k_base",
|
91
99
|
"davinci-codex": "p50k_base",
|
92
100
|
"cushman-codex": "p50k_base",
|
93
|
-
# edit
|
101
|
+
# edit (DEPRECATED)
|
94
102
|
"text-davinci-edit-001": "p50k_edit",
|
95
103
|
"code-davinci-edit-001": "p50k_edit",
|
96
|
-
# embeddings
|
97
|
-
"text-embedding-ada-002": "cl100k_base",
|
98
|
-
# old embeddings
|
104
|
+
# old embeddings (DEPRECATED)
|
99
105
|
"text-similarity-davinci-001": "r50k_base",
|
100
106
|
"text-similarity-curie-001": "r50k_base",
|
101
107
|
"text-similarity-babbage-001": "r50k_base",
|
@@ -105,10 +111,21 @@ module Tiktoken
|
|
105
111
|
"text-search-babbage-doc-001": "r50k_base",
|
106
112
|
"text-search-ada-doc-001": "r50k_base",
|
107
113
|
"code-search-babbage-code-001": "r50k_base",
|
108
|
-
"code-search-ada-code-001": "r50k_base"
|
114
|
+
"code-search-ada-code-001": "r50k_base",
|
115
|
+
# open source
|
116
|
+
gpt2: "gpt2"
|
109
117
|
}
|
110
118
|
|
111
|
-
|
112
|
-
|
119
|
+
MODEL_PREFIX_TO_ENCODING = {
|
120
|
+
# chat
|
121
|
+
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
122
|
+
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
123
|
+
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
124
|
+
# fine-tuned
|
125
|
+
"ft:gpt-4": "cl100k_base",
|
126
|
+
"ft:gpt-3.5-turbo": "cl100k_base",
|
127
|
+
"ft:davinci-002": "cl100k_base",
|
128
|
+
"ft:babbage-002": "cl100k_base"
|
129
|
+
}
|
113
130
|
end
|
114
131
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiktoken_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: arm64-darwin
|
6
6
|
authors:
|
7
7
|
- IAPark
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
14
14
|
used by OpenAI. It can be used to count the number of tokens in text before sending
|
@@ -31,6 +31,7 @@ files:
|
|
31
31
|
- lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle
|
32
32
|
- lib/tiktoken_ruby/3.1/tiktoken_ruby.bundle
|
33
33
|
- lib/tiktoken_ruby/3.2/tiktoken_ruby.bundle
|
34
|
+
- lib/tiktoken_ruby/3.3/tiktoken_ruby.bundle
|
34
35
|
- lib/tiktoken_ruby/encoding.rb
|
35
36
|
- lib/tiktoken_ruby/version.rb
|
36
37
|
- sig/tiktoken_ruby.rbs
|
@@ -52,7 +53,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
52
53
|
version: '3.0'
|
53
54
|
- - "<"
|
54
55
|
- !ruby/object:Gem::Version
|
55
|
-
version: 3.
|
56
|
+
version: 3.4.dev
|
56
57
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
58
|
requirements:
|
58
59
|
- - ">="
|