tiktoken_ruby 0.0.3-x86_64-darwin → 0.0.5-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5ff2e729499103996a4b72baf94f7130746ca7ce453597c492ed1a679b5faab3
4
- data.tar.gz: 583ff111ac69257887fdfae23c84420b9f305ac36ba37450fbbc87e4e5c707dd
3
+ metadata.gz: 0ca5ca7c3f9e6736bb3f30805554db928aea7a0c84378d42cdb5a58bbcfcf526
4
+ data.tar.gz: 4ca2b4bfd46a8119a8c8d58140c5fadf4e0b5c366695ac2a7173609aebb0d862
5
5
  SHA512:
6
- metadata.gz: 5b9961606f15107c39547687741c39631da1244d9d44ac7d78347ebc13a6088aa0e13451779a7f4aa064ec9b929231c6777ceac296f730a8db07182fa97c93b1
7
- data.tar.gz: 5a27a8983d4c29e9b17b5f5692955a38c26f903c098edb35ad690ed08d88c7c0425daa2bc44aff3742073b380b63eaea41d03dd05a22f2630fca89c4b41a3562
6
+ metadata.gz: caf4a222f5ae7a76823bdbd0a6639ae285e4bb4e1de6f58d77f5d5a341d20c733efac27022423f3151b1da0a085e2da9d634aff264615f969fc9390697f695ce
7
+ data.tar.gz: bc7115d352e2faa4283477934d0f09c9470c9a590aa4a924d0dea87d394b49889853b6bd8beac2f10367f6efc4d508732d8cc32931742ee64e99bdfbac9765e7
data/Gemfile CHANGED
@@ -8,11 +8,9 @@ gemspec
8
8
  gem "rake", "~> 13.0"
9
9
 
10
10
  gem "rake-compiler"
11
- gem "rb_sys"
12
11
 
13
12
  gem "rspec", "~> 3.0"
14
13
 
15
14
  gem "standard", "~> 1.3"
16
- gem 'pry', '~> 0.14.2'
17
15
 
18
16
  gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,24 +1,20 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.3)
4
+ tiktoken_ruby (0.0.5)
5
+ rb_sys (~> 0.9.68)
5
6
 
6
7
  GEM
7
8
  remote: https://rubygems.org/
8
9
  specs:
9
10
  ast (2.4.2)
10
- coderay (1.1.3)
11
11
  diff-lcs (1.5.0)
12
12
  json (2.6.3)
13
13
  language_server-protocol (3.17.0.3)
14
- method_source (1.0.0)
15
14
  minitest (5.18.0)
16
15
  parallel (1.22.1)
17
16
  parser (3.2.1.1)
18
17
  ast (~> 2.4.1)
19
- pry (0.14.2)
20
- coderay (~> 1.1)
21
- method_source (~> 1.0)
22
18
  rainbow (3.1.1)
23
19
  rake (13.0.6)
24
20
  rake-compiler (1.2.1)
@@ -69,13 +65,13 @@ GEM
69
65
 
70
66
  PLATFORMS
71
67
  arm64-darwin-22
68
+ ruby
69
+ x86_64-darwin-22
72
70
  x86_64-linux
73
71
 
74
72
  DEPENDENCIES
75
- pry (~> 0.14.2)
76
73
  rake (~> 13.0)
77
74
  rake-compiler
78
- rb_sys
79
75
  rspec (~> 3.0)
80
76
  standard (~> 1.3)
81
77
  tiktoken_ruby!
data/README.md CHANGED
@@ -15,7 +15,7 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
- Usage should be very similar to the python library. here's a simple example
18
+ Usage should be very similar to the python library. Here's a simple example
19
19
 
20
20
  Encode and decode text
21
21
  ```ruby
@@ -43,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
43
43
 
44
44
  Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
45
45
 
46
+ To get started with development:
47
+
48
+ ```sh
49
+ git clone https://github.com/IAPark/tiktoken_ruby.git
50
+ cd tiktoken_ruby
51
+ bundle install
52
+ bundle exec rake compile
53
+ bundle exec rake spec
54
+ ```
55
+
56
+
46
57
  ## License
47
58
 
48
59
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -14,7 +14,6 @@ end
14
14
 
15
15
  RSpec::Core::RakeTask.new(:spec)
16
16
 
17
-
18
17
  task :native, [:platform] do |_t, platform:|
19
18
  sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
20
19
  end
data/doctest_helper.rb CHANGED
@@ -1 +1 @@
1
- require 'lib/tiktoken_ruby.rb'
1
+ require "lib/tiktoken_ruby"
@@ -1,51 +1,52 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Tiktoken::Encoding
4
- attr_reader :name
5
-
6
- # This returns a new Tiktoken::Encoding instance for the requested encoding
7
- # @param encoding [Symbol] The name of the encoding to load
8
- # @return [Tiktoken::Encoding] The encoding instance
9
- def self.for_name(encoding)
10
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
11
- end
12
-
13
- # This returns a Tiktoken::Encoding instance for the requested encoding
14
- # It will reuse an existing encoding if it's already been loaded
15
- # @param encoding [Symbol] The name of the encoding to load
16
- # @return [Tiktoken::Encoding] The encoding instance
17
- def self.for_name_cached(encoding)
18
- @encodings ||= {}
19
- @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
- end
21
-
22
- # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
- # basically it's unescaped
24
- # @param text [String] The text to encode
25
- # @return [Array<Integer>] The encoded tokens
26
- def encode_ordinary(text)
27
- @ext_base_bpe.encode_ordinary(text)
28
- end
29
-
30
- # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
- # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
- # @param text [String] The text to encode
33
- # @param allowed_special [Array<String>] An array of special tokens to allow
34
- # @return [Array<Integer>] The encoded tokens
35
- def encode(text, allowed_special: [])
36
- @ext_base_bpe.encode(text, allowed_special)
37
- end
38
-
39
- # Decodes the tokens back into text
40
- # @param tokens [Array<Integer>] The tokens to decode
41
- # @return [String] The decoded text
42
- def decode(tokens)
43
- @ext_base_bpe.decode(tokens)
44
- end
45
-
46
- private
47
- def initialize(ext_base_bpe, name)
48
- @ext_base_bpe = ext_base_bpe
49
- @name = name
50
- end
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
11
+ end
12
+
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ end
21
+
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
26
+ def encode_ordinary(text)
27
+ @ext_base_bpe.encode_ordinary(text)
28
+ end
29
+
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
35
+ def encode(text, allowed_special: [])
36
+ @ext_base_bpe.encode(text, allowed_special)
37
+ end
38
+
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
42
+ def decode(tokens)
43
+ @ext_base_bpe.decode(tokens)
44
+ end
45
+
46
+ private
47
+
48
+ def initialize(ext_base_bpe, name)
49
+ @ext_base_bpe = ext_base_bpe
50
+ @name = name
51
+ end
51
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.3"
4
+ VERSION = "0.0.5"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "tiktoken_ruby/version"
4
- require_relative "tiktoken_ruby/encoding.rb"
4
+ require_relative "tiktoken_ruby/encoding"
5
5
 
6
6
  begin
7
7
  RUBY_VERSION =~ /(\d+\.\d+)/
@@ -12,7 +12,6 @@ end
12
12
 
13
13
  module Tiktoken
14
14
  class << self
15
-
16
15
  # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
17
16
  # it will reuse the instance of that type that was previously loaded
18
17
  # @param name [Symbol|String] The name of the encoding to load
@@ -34,7 +33,7 @@ module Tiktoken
34
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
35
34
  # enc.encode("hello world").length #=> 2
36
35
  def encoding_for_model(model_name)
37
- for prefix in PREFIX_MODELS
36
+ PREFIX_MODELS.each do |prefix|
38
37
  if model_name.to_s.start_with?("#{prefix}-")
39
38
  model_name = prefix
40
39
  break
@@ -65,7 +64,7 @@ module Tiktoken
65
64
  :r50k_base,
66
65
  :p50k_base,
67
66
  :p50k_edit,
68
- :cl100k_base,
67
+ :cl100k_base
69
68
  ]
70
69
 
71
70
  # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
@@ -80,10 +79,10 @@ module Tiktoken
80
79
  "text-curie-001": "r50k_base",
81
80
  "text-babbage-001": "r50k_base",
82
81
  "text-ada-001": "r50k_base",
83
- "davinci": "r50k_base",
84
- "curie": "r50k_base",
85
- "babbage": "r50k_base",
86
- "ada": "r50k_base",
82
+ davinci: "r50k_base",
83
+ curie: "r50k_base",
84
+ babbage: "r50k_base",
85
+ ada: "r50k_base",
87
86
  # code
88
87
  "code-davinci-002": "p50k_base",
89
88
  "code-davinci-001": "p50k_base",
@@ -106,7 +105,7 @@ module Tiktoken
106
105
  "text-search-babbage-doc-001": "r50k_base",
107
106
  "text-search-ada-doc-001": "r50k_base",
108
107
  "code-search-babbage-code-001": "r50k_base",
109
- "code-search-ada-code-001": "r50k_base",
108
+ "code-search-ada-code-001": "r50k_base"
110
109
  }
111
110
 
112
111
  # these are models that have versioned variants that are otherwise identical
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-21 00:00:00.000000000 Z
11
+ date: 2023-05-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
14
  used by OpenAI. It can be used to count the number of tokens in text before sending
@@ -28,7 +28,6 @@ files:
28
28
  - Rakefile
29
29
  - doctest_helper.rb
30
30
  - lib/tiktoken_ruby.rb
31
- - lib/tiktoken_ruby/2.7/tiktoken_ruby.bundle
32
31
  - lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle
33
32
  - lib/tiktoken_ruby/3.1/tiktoken_ruby.bundle
34
33
  - lib/tiktoken_ruby/3.2/tiktoken_ruby.bundle
@@ -50,7 +49,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
50
49
  requirements:
51
50
  - - ">="
52
51
  - !ruby/object:Gem::Version
53
- version: '2.7'
52
+ version: '3.0'
54
53
  - - "<"
55
54
  - !ruby/object:Gem::Version
56
55
  version: 3.3.dev