tiktoken_ruby 0.0.3-x64-mingw-ucrt → 0.0.4-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0fc02070b46f894e41faf76d01edb3448bcf9fec33d8ce9c84a066f3b6a3c42c
4
- data.tar.gz: d3059efb8fb610aabf5a32427d6d82040882b1384c046cdbd8c8f556babbfd54
3
+ metadata.gz: 1d3ae645bd74a9706f834cfddab56cfba8233233243f0f9f7a6e38f453ea0d7f
4
+ data.tar.gz: c9aba2e4ffecf42dda1ec11b40e22c929dc471cf0fdffcbaa243e477b7aa18dd
5
5
  SHA512:
6
- metadata.gz: 1b5826fa7ca9377abc35a5c4dd985afa2960110def15b992d350363b9fe66340d27e3ee9845abd4ba2571e9bec6777eb0d46902cb4b88c5e1ff3fc9ef0ad2346
7
- data.tar.gz: f4613b799248700b53e24eaa7e87f3cca774921955b3db8fd5cddecc027b6df37924daa5dabed55943e89bb0bc6d78d7528e6c33eb46dac5ecaa2f1a868af019
6
+ metadata.gz: 9163832bedcebe8d97b2319862bc7230ea78441b0883655392671e9269360dd74a65234b79a2277876a786e2473b0be3feadea6b865283217ed2a75b019de2f2
7
+ data.tar.gz: ece1e1a541aa4b63818096b837ae1ae2e0f4fef91e1ff6baebfd2471f10fb746d6830d47c170d7fd524adede6d7411abe78bfa650ef4e27997c5064ef871373a
data/Gemfile CHANGED
@@ -13,6 +13,5 @@ gem "rb_sys"
13
13
  gem "rspec", "~> 3.0"
14
14
 
15
15
  gem "standard", "~> 1.3"
16
- gem 'pry', '~> 0.14.2'
17
16
 
18
17
  gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,24 +1,19 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.3)
4
+ tiktoken_ruby (0.0.4)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  ast (2.4.2)
10
- coderay (1.1.3)
11
10
  diff-lcs (1.5.0)
12
11
  json (2.6.3)
13
12
  language_server-protocol (3.17.0.3)
14
- method_source (1.0.0)
15
13
  minitest (5.18.0)
16
14
  parallel (1.22.1)
17
15
  parser (3.2.1.1)
18
16
  ast (~> 2.4.1)
19
- pry (0.14.2)
20
- coderay (~> 1.1)
21
- method_source (~> 1.0)
22
17
  rainbow (3.1.1)
23
18
  rake (13.0.6)
24
19
  rake-compiler (1.2.1)
@@ -69,10 +64,10 @@ GEM
69
64
 
70
65
  PLATFORMS
71
66
  arm64-darwin-22
67
+ x86_64-darwin-22
72
68
  x86_64-linux
73
69
 
74
70
  DEPENDENCIES
75
- pry (~> 0.14.2)
76
71
  rake (~> 13.0)
77
72
  rake-compiler
78
73
  rb_sys
data/README.md CHANGED
@@ -15,7 +15,7 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
- Usage should be very similar to the python library. here's a simple example
18
+ Usage should be very similar to the python library. Here's a simple example
19
19
 
20
20
  Encode and decode text
21
21
  ```ruby
@@ -43,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
43
43
 
44
44
  Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
45
45
 
46
+ To get started with development:
47
+
48
+ ```sh
49
+ git clone https://github.com/IAPark/tiktoken_ruby.git
50
+ cd tiktoken_ruby
51
+ bundle install
52
+ bundle exec rake compile
53
+ bundle exec rake spec
54
+ ```
55
+
56
+
46
57
  ## License
47
58
 
48
59
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -14,7 +14,6 @@ end
14
14
 
15
15
  RSpec::Core::RakeTask.new(:spec)
16
16
 
17
-
18
17
  task :native, [:platform] do |_t, platform:|
19
18
  sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
20
19
  end
data/doctest_helper.rb CHANGED
@@ -1 +1 @@
1
- require 'lib/tiktoken_ruby.rb'
1
+ require "lib/tiktoken_ruby"
Binary file
Binary file
@@ -1,51 +1,52 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Tiktoken::Encoding
4
- attr_reader :name
5
-
6
- # This returns a new Tiktoken::Encoding instance for the requested encoding
7
- # @param encoding [Symbol] The name of the encoding to load
8
- # @return [Tiktoken::Encoding] The encoding instance
9
- def self.for_name(encoding)
10
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
11
- end
12
-
13
- # This returns a Tiktoken::Encoding instance for the requested encoding
14
- # It will reuse an existing encoding if it's already been loaded
15
- # @param encoding [Symbol] The name of the encoding to load
16
- # @return [Tiktoken::Encoding] The encoding instance
17
- def self.for_name_cached(encoding)
18
- @encodings ||= {}
19
- @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
- end
21
-
22
- # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
- # basically it's unescaped
24
- # @param text [String] The text to encode
25
- # @return [Array<Integer>] The encoded tokens
26
- def encode_ordinary(text)
27
- @ext_base_bpe.encode_ordinary(text)
28
- end
29
-
30
- # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
- # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
- # @param text [String] The text to encode
33
- # @param allowed_special [Array<String>] An array of special tokens to allow
34
- # @return [Array<Integer>] The encoded tokens
35
- def encode(text, allowed_special: [])
36
- @ext_base_bpe.encode(text, allowed_special)
37
- end
38
-
39
- # Decodes the tokens back into text
40
- # @param tokens [Array<Integer>] The tokens to decode
41
- # @return [String] The decoded text
42
- def decode(tokens)
43
- @ext_base_bpe.decode(tokens)
44
- end
45
-
46
- private
47
- def initialize(ext_base_bpe, name)
48
- @ext_base_bpe = ext_base_bpe
49
- @name = name
50
- end
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
11
+ end
12
+
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ end
21
+
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
26
+ def encode_ordinary(text)
27
+ @ext_base_bpe.encode_ordinary(text)
28
+ end
29
+
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
35
+ def encode(text, allowed_special: [])
36
+ @ext_base_bpe.encode(text, allowed_special)
37
+ end
38
+
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
42
+ def decode(tokens)
43
+ @ext_base_bpe.decode(tokens)
44
+ end
45
+
46
+ private
47
+
48
+ def initialize(ext_base_bpe, name)
49
+ @ext_base_bpe = ext_base_bpe
50
+ @name = name
51
+ end
51
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.3"
4
+ VERSION = "0.0.4"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "tiktoken_ruby/version"
4
- require_relative "tiktoken_ruby/encoding.rb"
4
+ require_relative "tiktoken_ruby/encoding"
5
5
 
6
6
  begin
7
7
  RUBY_VERSION =~ /(\d+\.\d+)/
@@ -12,7 +12,6 @@ end
12
12
 
13
13
  module Tiktoken
14
14
  class << self
15
-
16
15
  # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
17
16
  # it will reuse the instance of that type that was previous loaded
18
17
  # @param name [Symbol|String] The name of the encoding to load
@@ -34,7 +33,7 @@ module Tiktoken
34
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
35
34
  # enc.encode("hello world").length #=> 2
36
35
  def encoding_for_model(model_name)
37
- for prefix in PREFIX_MODELS
36
+ PREFIX_MODELS.each do |prefix|
38
37
  if model_name.to_s.start_with?("#{prefix}-")
39
38
  model_name = prefix
40
39
  break
@@ -65,7 +64,7 @@ module Tiktoken
65
64
  :r50k_base,
66
65
  :p50k_base,
67
66
  :p50k_edit,
68
- :cl100k_base,
67
+ :cl100k_base
69
68
  ]
70
69
 
71
70
  # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
@@ -80,10 +79,10 @@ module Tiktoken
80
79
  "text-curie-001": "r50k_base",
81
80
  "text-babbage-001": "r50k_base",
82
81
  "text-ada-001": "r50k_base",
83
- "davinci": "r50k_base",
84
- "curie": "r50k_base",
85
- "babbage": "r50k_base",
86
- "ada": "r50k_base",
82
+ davinci: "r50k_base",
83
+ curie: "r50k_base",
84
+ babbage: "r50k_base",
85
+ ada: "r50k_base",
87
86
  # code
88
87
  "code-davinci-002": "p50k_base",
89
88
  "code-davinci-001": "p50k_base",
@@ -106,7 +105,7 @@ module Tiktoken
106
105
  "text-search-babbage-doc-001": "r50k_base",
107
106
  "text-search-ada-doc-001": "r50k_base",
108
107
  "code-search-babbage-code-001": "r50k_base",
109
- "code-search-ada-code-001": "r50k_base",
108
+ "code-search-ada-code-001": "r50k_base"
110
109
  }
111
110
 
112
111
  # these are models that have a versioned models that are otherwise identical
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: x64-mingw-ucrt
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-21 00:00:00.000000000 Z
11
+ date: 2023-03-28 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
14
  used by OpenAI. It can be used to count the number of tokens in text before sending