tiktoken_ruby 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e011f7a82a6ee3d09405cd2c817d2ba9fe06e5a3bdde3ee622c9aefd9b1e8dd8
4
- data.tar.gz: 892539805e10e5e10456956aa65b72e7ab4ececae56db2c49ccd11848cd0d2c2
3
+ metadata.gz: de2d6e8e83771f2ef51351e019e9cebc7163a1775bfc0e812da58371574b9b63
4
+ data.tar.gz: fe2c629e8b435a181bfa4524655bfa137f1660061ab4f780e0edc15a11a7538d
5
5
  SHA512:
6
- metadata.gz: 5accbfe97904fb1e9b6c5fa793a9e326bb25befaca6dbf5a2f8e6cfcad5f3c80c7dd1d0f66aaa95bed732943a2c0999654ffa2cf40891b2c9443f8909efb1dc6
7
- data.tar.gz: 5e95a59d2876fb54e8a76e7d855c42d4e314be211f495dae5ef0bf4004cc44eeb0ea433a620e037232349a10d107ed25afafa704e1fffcddb4ce29219a199e87
6
+ metadata.gz: bbf721963e873464fae055d23308068fcdac8db4e27dea28653c3fc017f0803da59b4c549a0fb6b6339f79bc4379e2913e1c47fa25f9894ebde840b23aa81edb
7
+ data.tar.gz: 1275369f56a2498ce39c5a2b259efdab0684ae834c6d769bfe23d7190c69917a22d46dc7b7675cd15c8a44dcfa8f7c3be2df850035236fa33f8e55afd31db42b
data/Gemfile CHANGED
@@ -2,15 +2,11 @@
2
2
 
3
3
  source "https://rubygems.org"
4
4
 
5
- # Specify your gem's dependencies in tiktoken_ruby.gemspec
6
5
  gemspec
7
6
 
8
- gem "rake", "~> 13.0"
9
-
7
+ gem "rake"
10
8
  gem "rake-compiler"
11
-
12
- gem "rspec", "~> 3.0"
13
-
14
- gem "standard", "~> 1.3"
15
-
16
- gem "yard-doctest", "~> 0.1.17"
9
+ gem "rspec"
10
+ gem "standard"
11
+ gem "yard-doctest"
12
+ gem "racc"
data/Gemfile.lock CHANGED
@@ -1,64 +1,74 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.6)
5
- rb_sys (~> 0.9.68)
4
+ tiktoken_ruby (0.0.7)
5
+ rb_sys (>= 0.9.86)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
10
  ast (2.4.2)
11
11
  diff-lcs (1.5.0)
12
- json (2.6.3)
12
+ json (2.7.1)
13
13
  language_server-protocol (3.17.0.3)
14
- minitest (5.18.0)
15
- parallel (1.22.1)
16
- parser (3.2.1.1)
14
+ lint_roller (1.1.0)
15
+ minitest (5.21.2)
16
+ parallel (1.24.0)
17
+ parser (3.3.0.4)
17
18
  ast (~> 2.4.1)
19
+ racc
20
+ racc (1.7.3)
18
21
  rainbow (3.1.1)
19
- rake (13.0.6)
20
- rake-compiler (1.2.1)
22
+ rake (13.1.0)
23
+ rake-compiler (1.2.5)
21
24
  rake
22
- rb_sys (0.9.68)
23
- regexp_parser (2.7.0)
24
- rexml (3.2.5)
25
+ rb_sys (0.9.86)
26
+ regexp_parser (2.9.0)
27
+ rexml (3.2.6)
25
28
  rspec (3.12.0)
26
29
  rspec-core (~> 3.12.0)
27
30
  rspec-expectations (~> 3.12.0)
28
31
  rspec-mocks (~> 3.12.0)
29
- rspec-core (3.12.1)
32
+ rspec-core (3.12.2)
30
33
  rspec-support (~> 3.12.0)
31
- rspec-expectations (3.12.2)
34
+ rspec-expectations (3.12.3)
32
35
  diff-lcs (>= 1.2.0, < 2.0)
33
36
  rspec-support (~> 3.12.0)
34
- rspec-mocks (3.12.4)
37
+ rspec-mocks (3.12.6)
35
38
  diff-lcs (>= 1.2.0, < 2.0)
36
39
  rspec-support (~> 3.12.0)
37
- rspec-support (3.12.0)
38
- rubocop (1.48.1)
40
+ rspec-support (3.12.1)
41
+ rubocop (1.59.0)
39
42
  json (~> 2.3)
43
+ language_server-protocol (>= 3.17.0)
40
44
  parallel (~> 1.10)
41
- parser (>= 3.2.0.0)
45
+ parser (>= 3.2.2.4)
42
46
  rainbow (>= 2.2.2, < 4.0)
43
47
  regexp_parser (>= 1.8, < 3.0)
44
48
  rexml (>= 3.2.5, < 4.0)
45
- rubocop-ast (>= 1.26.0, < 2.0)
49
+ rubocop-ast (>= 1.30.0, < 2.0)
46
50
  ruby-progressbar (~> 1.7)
47
51
  unicode-display_width (>= 2.4.0, < 3.0)
48
- rubocop-ast (1.27.0)
52
+ rubocop-ast (1.30.0)
49
53
  parser (>= 3.2.1.0)
50
- rubocop-performance (1.16.0)
51
- rubocop (>= 1.7.0, < 2.0)
52
- rubocop-ast (>= 0.4.0)
54
+ rubocop-performance (1.20.2)
55
+ rubocop (>= 1.48.1, < 2.0)
56
+ rubocop-ast (>= 1.30.0, < 2.0)
53
57
  ruby-progressbar (1.13.0)
54
- standard (1.25.1)
58
+ standard (1.33.0)
55
59
  language_server-protocol (~> 3.17.0.2)
56
- rubocop (= 1.48.1)
57
- rubocop-performance (= 1.16.0)
58
- unicode-display_width (2.4.2)
59
- webrick (1.7.0)
60
- yard (0.9.28)
61
- webrick (~> 1.7.0)
60
+ lint_roller (~> 1.0)
61
+ rubocop (~> 1.59.0)
62
+ standard-custom (~> 1.0.0)
63
+ standard-performance (~> 1.3)
64
+ standard-custom (1.0.2)
65
+ lint_roller (~> 1.0)
66
+ rubocop (~> 1.50)
67
+ standard-performance (1.3.1)
68
+ lint_roller (~> 1.1)
69
+ rubocop-performance (~> 1.20.2)
70
+ unicode-display_width (2.5.0)
71
+ yard (0.9.34)
62
72
  yard-doctest (0.1.17)
63
73
  minitest
64
74
  yard
@@ -70,12 +80,13 @@ PLATFORMS
70
80
  x86_64-linux
71
81
 
72
82
  DEPENDENCIES
73
- rake (~> 13.0)
83
+ racc
84
+ rake
74
85
  rake-compiler
75
- rspec (~> 3.0)
76
- standard (~> 1.3)
86
+ rspec
87
+ standard
77
88
  tiktoken_ruby!
78
- yard-doctest (~> 0.1.17)
89
+ yard-doctest
79
90
 
80
91
  BUNDLED WITH
81
92
  2.4.6
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.6"
4
+ VERSION = "0.0.7"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -33,17 +33,15 @@ module Tiktoken
33
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
34
34
  # enc.encode("hello world").length #=> 2
35
35
  def encoding_for_model(model_name)
36
- PREFIX_MODELS.each do |prefix|
37
- if model_name.to_s.start_with?("#{prefix}-")
38
- model_name = prefix
39
- break
40
- end
36
+ if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
37
+ return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
41
38
  end
42
39
 
43
- encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
44
- return nil unless encoding_name
45
-
46
- get_encoding(encoding_name)
40
+ MODEL_PREFIX_TO_ENCODING.each do |prefix, encoding|
41
+ if model_name.start_with?(prefix.to_s)
42
+ return get_encoding(encoding)
43
+ end
44
+ end
47
45
  end
48
46
 
49
47
  # Lists all the encodings that are supported
@@ -67,12 +65,22 @@ module Tiktoken
67
65
  :cl100k_base
68
66
  ]
69
67
 
70
- # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
68
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
71
69
  # that is also MIT licensed but by OpenAI
72
70
  MODEL_TO_ENCODING_NAME = {
71
+ # chat
73
72
  "gpt-4": "cl100k_base",
74
73
  "gpt-3.5-turbo": "cl100k_base",
75
- # text
74
+ "gpt-35-turbo": "cl100k_base", # Azure deployment name
75
+ # base
76
+ "davinci-002": "cl100k_base",
77
+ "babbage-002": "cl100k_base",
78
+ # embeddings
79
+ "text-embedding-ada-002": "cl100k_base",
80
+ "text-embedding-3-small": "cl100k_base",
81
+ "text-embedding-3-large": "cl100k_base",
82
+ # DEPRECATED MODELS
83
+ # text (DEPRECATED)
76
84
  "text-davinci-003": "p50k_base",
77
85
  "text-davinci-002": "p50k_base",
78
86
  "text-davinci-001": "r50k_base",
@@ -83,19 +91,17 @@ module Tiktoken
83
91
  curie: "r50k_base",
84
92
  babbage: "r50k_base",
85
93
  ada: "r50k_base",
86
- # code
94
+ # code (DEPRECATED)
87
95
  "code-davinci-002": "p50k_base",
88
96
  "code-davinci-001": "p50k_base",
89
97
  "code-cushman-002": "p50k_base",
90
98
  "code-cushman-001": "p50k_base",
91
99
  "davinci-codex": "p50k_base",
92
100
  "cushman-codex": "p50k_base",
93
- # edit
101
+ # edit (DEPRECATED)
94
102
  "text-davinci-edit-001": "p50k_edit",
95
103
  "code-davinci-edit-001": "p50k_edit",
96
- # embeddings
97
- "text-embedding-ada-002": "cl100k_base",
98
- # old embeddings
104
+ # old embeddings (DEPRECATED)
99
105
  "text-similarity-davinci-001": "r50k_base",
100
106
  "text-similarity-curie-001": "r50k_base",
101
107
  "text-similarity-babbage-001": "r50k_base",
@@ -105,10 +111,21 @@ module Tiktoken
105
111
  "text-search-babbage-doc-001": "r50k_base",
106
112
  "text-search-ada-doc-001": "r50k_base",
107
113
  "code-search-babbage-code-001": "r50k_base",
108
- "code-search-ada-code-001": "r50k_base"
114
+ "code-search-ada-code-001": "r50k_base",
115
+ # open source
116
+ gpt2: "gpt2"
109
117
  }
110
118
 
111
- # these are models that have a versioned models that are otherwise identical
112
- PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
119
+ MODEL_PREFIX_TO_ENCODING = {
120
+ # chat
121
+ "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
122
+ "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
123
+ "gpt-35-turbo-": "cl100k_base", # Azure deployment name
124
+ # fine-tuned
125
+ "ft:gpt-4": "cl100k_base",
126
+ "ft:gpt-3.5-turbo": "cl100k_base",
127
+ "ft:davinci-002": "cl100k_base",
128
+ "ft:babbage-002": "cl100k_base"
129
+ }
113
130
  end
114
131
  end
@@ -7,12 +7,10 @@ Gem::Specification.new do |spec|
7
7
  spec.version = Tiktoken::VERSION
8
8
  spec.authors = ["IAPark"]
9
9
  spec.email = ["isaac.a.park@gmail.com"]
10
-
11
10
  spec.summary = "Ruby wrapper for Tiktoken"
12
11
  spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
13
12
  "a BPE tokenizer written by and used by OpenAI. It can be used to " \
14
13
  "count the number of tokens in text before sending it to OpenAI APIs."
15
-
16
14
  spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
17
15
  spec.license = "MIT"
18
16
  spec.required_ruby_version = ">= 2.7.0"
@@ -22,11 +20,6 @@ Gem::Specification.new do |spec|
22
20
  spec.metadata["homepage_uri"] = spec.homepage
23
21
  spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
24
22
  spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
25
-
26
- # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
27
-
28
- # Specify which files should be added to the gem when it is released.
29
- # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
30
23
  spec.files = Dir.chdir(__dir__) do
31
24
  `git ls-files -z`.split("\x0").reject do |f|
32
25
  (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
@@ -36,9 +29,5 @@ Gem::Specification.new do |spec|
36
29
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
37
30
  spec.require_paths = ["lib"]
38
31
  spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
39
-
40
- spec.add_dependency "rb_sys", "~> 0.9.68"
41
-
42
- # For more information and examples about making a new gem, check out our
43
- # guide at: https://bundler.io/guides/creating_gem.html
32
+ spec.add_dependency "rb_sys", ">= 0.9.86"
44
33
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-09-24 00:00:00.000000000 Z
11
+ date: 2024-02-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.9.68
19
+ version: 0.9.86
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.9.68
26
+ version: 0.9.86
27
27
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
28
28
  used by OpenAI. It can be used to count the number of tokens in text before sending
29
29
  it to OpenAI APIs.