tiktoken_ruby 0.0.6-x86_64-darwin → 0.0.7-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7a6258ffc12c84a34f6176e322e4a1e3bf1a17cb940a5ba518ef6f5f4d8ed52
4
- data.tar.gz: 6d158bf86fe461f317399b814ea56ebe403a89b6cb10fc94ac95b7eebf8f7c1f
3
+ metadata.gz: 6bbdd6164228f0fa953d7988326bec5899f29b87cbe4c6b19e3146b6a98a6f35
4
+ data.tar.gz: 5a53fdc3cd7eb9289cffcf2dbe88328d1387c777bdf8312571c9e7c15d8ec2fc
5
5
  SHA512:
6
- metadata.gz: 8311a0e83d0189b064e3e00cf40776244dd4d9eb724ee8fea413782601c804d125796c2758c53cbb0b3c796a47142f4bbad6a7335193d5a1969a042c41868c39
7
- data.tar.gz: 95649bcdeb706d61a556d8302129b09a3f5726edc91adda359a0eef2d77182abb928234926c9047b106738b2b9a7ffa020ea1be266555ceb2f3687a60a4c4aec
6
+ metadata.gz: d895625940b103633790071183b00db0a0ee4c3df83a047ed34383de1ce54803c5b7caa7e2f71badd3d099197961c050abb425f40da9b1a57a0284e5e26e4d77
7
+ data.tar.gz: fddc06e751328c8cc773df5da02949ae021ad2f9cc1039e6be6affd7f954e5ca96284e3981c25a923455ef106c4f0d836076b58e0e41f55f2b2f265eaae13162
data/Gemfile CHANGED
@@ -2,15 +2,11 @@
2
2
 
3
3
  source "https://rubygems.org"
4
4
 
5
- # Specify your gem's dependencies in tiktoken_ruby.gemspec
6
5
  gemspec
7
6
 
8
- gem "rake", "~> 13.0"
9
-
7
+ gem "rake"
10
8
  gem "rake-compiler"
11
-
12
- gem "rspec", "~> 3.0"
13
-
14
- gem "standard", "~> 1.3"
15
-
16
- gem "yard-doctest", "~> 0.1.17"
9
+ gem "rspec"
10
+ gem "standard"
11
+ gem "yard-doctest"
12
+ gem "racc"
data/Gemfile.lock CHANGED
@@ -1,64 +1,74 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.6)
5
- rb_sys (~> 0.9.68)
4
+ tiktoken_ruby (0.0.7)
5
+ rb_sys (>= 0.9.86)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
10
  ast (2.4.2)
11
11
  diff-lcs (1.5.0)
12
- json (2.6.3)
12
+ json (2.7.1)
13
13
  language_server-protocol (3.17.0.3)
14
- minitest (5.18.0)
15
- parallel (1.22.1)
16
- parser (3.2.1.1)
14
+ lint_roller (1.1.0)
15
+ minitest (5.21.2)
16
+ parallel (1.24.0)
17
+ parser (3.3.0.4)
17
18
  ast (~> 2.4.1)
19
+ racc
20
+ racc (1.7.3)
18
21
  rainbow (3.1.1)
19
- rake (13.0.6)
20
- rake-compiler (1.2.1)
22
+ rake (13.1.0)
23
+ rake-compiler (1.2.5)
21
24
  rake
22
- rb_sys (0.9.68)
23
- regexp_parser (2.7.0)
24
- rexml (3.2.5)
25
+ rb_sys (0.9.86)
26
+ regexp_parser (2.9.0)
27
+ rexml (3.2.6)
25
28
  rspec (3.12.0)
26
29
  rspec-core (~> 3.12.0)
27
30
  rspec-expectations (~> 3.12.0)
28
31
  rspec-mocks (~> 3.12.0)
29
- rspec-core (3.12.1)
32
+ rspec-core (3.12.2)
30
33
  rspec-support (~> 3.12.0)
31
- rspec-expectations (3.12.2)
34
+ rspec-expectations (3.12.3)
32
35
  diff-lcs (>= 1.2.0, < 2.0)
33
36
  rspec-support (~> 3.12.0)
34
- rspec-mocks (3.12.4)
37
+ rspec-mocks (3.12.6)
35
38
  diff-lcs (>= 1.2.0, < 2.0)
36
39
  rspec-support (~> 3.12.0)
37
- rspec-support (3.12.0)
38
- rubocop (1.48.1)
40
+ rspec-support (3.12.1)
41
+ rubocop (1.59.0)
39
42
  json (~> 2.3)
43
+ language_server-protocol (>= 3.17.0)
40
44
  parallel (~> 1.10)
41
- parser (>= 3.2.0.0)
45
+ parser (>= 3.2.2.4)
42
46
  rainbow (>= 2.2.2, < 4.0)
43
47
  regexp_parser (>= 1.8, < 3.0)
44
48
  rexml (>= 3.2.5, < 4.0)
45
- rubocop-ast (>= 1.26.0, < 2.0)
49
+ rubocop-ast (>= 1.30.0, < 2.0)
46
50
  ruby-progressbar (~> 1.7)
47
51
  unicode-display_width (>= 2.4.0, < 3.0)
48
- rubocop-ast (1.27.0)
52
+ rubocop-ast (1.30.0)
49
53
  parser (>= 3.2.1.0)
50
- rubocop-performance (1.16.0)
51
- rubocop (>= 1.7.0, < 2.0)
52
- rubocop-ast (>= 0.4.0)
54
+ rubocop-performance (1.20.2)
55
+ rubocop (>= 1.48.1, < 2.0)
56
+ rubocop-ast (>= 1.30.0, < 2.0)
53
57
  ruby-progressbar (1.13.0)
54
- standard (1.25.1)
58
+ standard (1.33.0)
55
59
  language_server-protocol (~> 3.17.0.2)
56
- rubocop (= 1.48.1)
57
- rubocop-performance (= 1.16.0)
58
- unicode-display_width (2.4.2)
59
- webrick (1.7.0)
60
- yard (0.9.28)
61
- webrick (~> 1.7.0)
60
+ lint_roller (~> 1.0)
61
+ rubocop (~> 1.59.0)
62
+ standard-custom (~> 1.0.0)
63
+ standard-performance (~> 1.3)
64
+ standard-custom (1.0.2)
65
+ lint_roller (~> 1.0)
66
+ rubocop (~> 1.50)
67
+ standard-performance (1.3.1)
68
+ lint_roller (~> 1.1)
69
+ rubocop-performance (~> 1.20.2)
70
+ unicode-display_width (2.5.0)
71
+ yard (0.9.34)
62
72
  yard-doctest (0.1.17)
63
73
  minitest
64
74
  yard
@@ -70,12 +80,13 @@ PLATFORMS
70
80
  x86_64-linux
71
81
 
72
82
  DEPENDENCIES
73
- rake (~> 13.0)
83
+ racc
84
+ rake
74
85
  rake-compiler
75
- rspec (~> 3.0)
76
- standard (~> 1.3)
86
+ rspec
87
+ standard
77
88
  tiktoken_ruby!
78
- yard-doctest (~> 0.1.17)
89
+ yard-doctest
79
90
 
80
91
  BUNDLED WITH
81
92
  2.4.6
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.6"
4
+ VERSION = "0.0.7"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -33,17 +33,15 @@ module Tiktoken
33
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
34
34
  # enc.encode("hello world").length #=> 2
35
35
  def encoding_for_model(model_name)
36
- PREFIX_MODELS.each do |prefix|
37
- if model_name.to_s.start_with?("#{prefix}-")
38
- model_name = prefix
39
- break
40
- end
36
+ if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
37
+ return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
41
38
  end
42
39
 
43
- encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
44
- return nil unless encoding_name
45
-
46
- get_encoding(encoding_name)
40
+ MODEL_PREFIX_TO_ENCODING.each do |prefix, encoding|
41
+ if model_name.start_with?(prefix.to_s)
42
+ return get_encoding(encoding)
43
+ end
44
+ end
47
45
  end
48
46
 
49
47
  # Lists all the encodings that are supported
@@ -67,12 +65,22 @@ module Tiktoken
67
65
  :cl100k_base
68
66
  ]
69
67
 
70
- # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
68
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
71
69
  # that is also MIT licensed but by OpenAI
72
70
  MODEL_TO_ENCODING_NAME = {
71
+ # chat
73
72
  "gpt-4": "cl100k_base",
74
73
  "gpt-3.5-turbo": "cl100k_base",
75
- # text
74
+ "gpt-35-turbo": "cl100k_base", # Azure deployment name
75
+ # base
76
+ "davinci-002": "cl100k_base",
77
+ "babbage-002": "cl100k_base",
78
+ # embeddings
79
+ "text-embedding-ada-002": "cl100k_base",
80
+ "text-embedding-3-small": "cl100k_base",
81
+ "text-embedding-3-large": "cl100k_base",
82
+ # DEPRECATED MODELS
83
+ # text (DEPRECATED)
76
84
  "text-davinci-003": "p50k_base",
77
85
  "text-davinci-002": "p50k_base",
78
86
  "text-davinci-001": "r50k_base",
@@ -83,19 +91,17 @@ module Tiktoken
83
91
  curie: "r50k_base",
84
92
  babbage: "r50k_base",
85
93
  ada: "r50k_base",
86
- # code
94
+ # code (DEPRECATED)
87
95
  "code-davinci-002": "p50k_base",
88
96
  "code-davinci-001": "p50k_base",
89
97
  "code-cushman-002": "p50k_base",
90
98
  "code-cushman-001": "p50k_base",
91
99
  "davinci-codex": "p50k_base",
92
100
  "cushman-codex": "p50k_base",
93
- # edit
101
+ # edit (DEPRECATED)
94
102
  "text-davinci-edit-001": "p50k_edit",
95
103
  "code-davinci-edit-001": "p50k_edit",
96
- # embeddings
97
- "text-embedding-ada-002": "cl100k_base",
98
- # old embeddings
104
+ # old embeddings (DEPRECATED)
99
105
  "text-similarity-davinci-001": "r50k_base",
100
106
  "text-similarity-curie-001": "r50k_base",
101
107
  "text-similarity-babbage-001": "r50k_base",
@@ -105,10 +111,21 @@ module Tiktoken
105
111
  "text-search-babbage-doc-001": "r50k_base",
106
112
  "text-search-ada-doc-001": "r50k_base",
107
113
  "code-search-babbage-code-001": "r50k_base",
108
- "code-search-ada-code-001": "r50k_base"
114
+ "code-search-ada-code-001": "r50k_base",
115
+ # open source
116
+ gpt2: "gpt2"
109
117
  }
110
118
 
111
- # these are models that have a versioned models that are otherwise identical
112
- PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
119
+ MODEL_PREFIX_TO_ENCODING = {
120
+ # chat
121
+ "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
122
+ "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
123
+ "gpt-35-turbo-": "cl100k_base", # Azure deployment name
124
+ # fine-tuned
125
+ "ft:gpt-4": "cl100k_base",
126
+ "ft:gpt-3.5-turbo": "cl100k_base",
127
+ "ft:davinci-002": "cl100k_base",
128
+ "ft:babbage-002": "cl100k_base"
129
+ }
113
130
  end
114
131
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-09-24 00:00:00.000000000 Z
11
+ date: 2024-02-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
14
  used by OpenAI. It can be used to count the number of tokens in text before sending
@@ -31,6 +31,7 @@ files:
31
31
  - lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle
32
32
  - lib/tiktoken_ruby/3.1/tiktoken_ruby.bundle
33
33
  - lib/tiktoken_ruby/3.2/tiktoken_ruby.bundle
34
+ - lib/tiktoken_ruby/3.3/tiktoken_ruby.bundle
34
35
  - lib/tiktoken_ruby/encoding.rb
35
36
  - lib/tiktoken_ruby/version.rb
36
37
  - sig/tiktoken_ruby.rbs
@@ -52,7 +53,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
52
53
  version: '3.0'
53
54
  - - "<"
54
55
  - !ruby/object:Gem::Version
55
- version: 3.3.dev
56
+ version: 3.4.dev
56
57
  required_rubygems_version: !ruby/object:Gem::Requirement
57
58
  requirements:
58
59
  - - ">="