tiktoken_ruby 0.0.2-x64-mingw-ucrt → 0.0.3-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8a76f3a21d5c1994e02b643393862b4d4183c40d5c2ad004cda1280b7ca4c3d9
4
- data.tar.gz: fa20012e669bcd1ef57e6781488405c0436b65a960270a18e2603b1610278950
3
+ metadata.gz: 0fc02070b46f894e41faf76d01edb3448bcf9fec33d8ce9c84a066f3b6a3c42c
4
+ data.tar.gz: d3059efb8fb610aabf5a32427d6d82040882b1384c046cdbd8c8f556babbfd54
5
5
  SHA512:
6
- metadata.gz: db8f73e134a5fdb08e8dbb520e01b62829f373cc7f6073ca80928e6deeb5e4fee30d9fb57dbb4f691b570edaafad7eb6e507ead8d588b975a1a1ebbe477e8489
7
- data.tar.gz: 8f89833ea89e23fdc50d71ca927d69bc15ec45dc81e399e3014f66b854b039dc51bbb3faa5f468b478438aa34aa09cd4f2c7f6c7702e04dcee9b734e72a2a12f
6
+ metadata.gz: 1b5826fa7ca9377abc35a5c4dd985afa2960110def15b992d350363b9fe66340d27e3ee9845abd4ba2571e9bec6777eb0d46902cb4b88c5e1ff3fc9ef0ad2346
7
+ data.tar.gz: f4613b799248700b53e24eaa7e87f3cca774921955b3db8fd5cddecc027b6df37924daa5dabed55943e89bb0bc6d78d7528e6c33eb46dac5ecaa2f1a868af019
data/Gemfile CHANGED
@@ -14,3 +14,5 @@ gem "rspec", "~> 3.0"
14
14
 
15
15
  gem "standard", "~> 1.3"
16
16
  gem 'pry', '~> 0.14.2'
17
+
18
+ gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.2)
4
+ tiktoken_ruby (0.0.3)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -12,6 +12,7 @@ GEM
12
12
  json (2.6.3)
13
13
  language_server-protocol (3.17.0.3)
14
14
  method_source (1.0.0)
15
+ minitest (5.18.0)
15
16
  parallel (1.22.1)
16
17
  parser (3.2.1.1)
17
18
  ast (~> 2.4.1)
@@ -59,6 +60,12 @@ GEM
59
60
  rubocop (= 1.48.1)
60
61
  rubocop-performance (= 1.16.0)
61
62
  unicode-display_width (2.4.2)
63
+ webrick (1.7.0)
64
+ yard (0.9.28)
65
+ webrick (~> 1.7.0)
66
+ yard-doctest (0.1.17)
67
+ minitest
68
+ yard
62
69
 
63
70
  PLATFORMS
64
71
  arm64-darwin-22
@@ -72,6 +79,7 @@ DEPENDENCIES
72
79
  rspec (~> 3.0)
73
80
  standard (~> 1.3)
74
81
  tiktoken_ruby!
82
+ yard-doctest (~> 0.1.17)
75
83
 
76
84
  BUNDLED WITH
77
85
  2.4.6
data/README.md CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
+ Usage should be very similar to the Python library. Here's a simple example:
18
19
 
20
+ Encode and decode text
19
21
  ```ruby
20
- encoding = Tiktoken::Encoding.r50k_base
21
- tokens = encoding.encode("Hello world!")
22
- puts encoding.decode(tokens)
22
+ require 'tiktoken_ruby'
23
+
24
+ enc = Tiktoken.get_encoding("cl100k_base")
25
+ enc.decode(enc.encode("hello world")) #=> "hello world"
26
+ ```
27
+
28
+ Encoders can also be retrieved by model name
29
+ ```ruby
30
+ require 'tiktoken_ruby'
31
+
32
+ enc = Tiktoken.encoding_for_model("gpt-4")
33
+ enc.encode("hello world").length #=> 2
23
34
  ```
24
35
 
25
36
  ## Development
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require 'lib/tiktoken_ruby.rb'
Binary file
Binary file
@@ -1,23 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Tiktoken::Encoding
4
- def self.method_missing(method)
5
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(method))
3
+ class Tiktoken::Encoding
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
6
11
  end
7
12
 
8
- def initialize(ext_base_bpe)
9
- @ext_base_bpe = ext_base_bpe
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
10
20
  end
11
21
 
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
12
26
  def encode_ordinary(text)
13
27
  @ext_base_bpe.encode_ordinary(text)
14
28
  end
15
29
 
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
16
35
  def encode(text, allowed_special: [])
17
36
  @ext_base_bpe.encode(text, allowed_special)
18
37
  end
19
38
 
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
20
42
  def decode(tokens)
21
43
  @ext_base_bpe.decode(tokens)
22
44
  end
45
+
46
+ private
47
+ def initialize(ext_base_bpe, name)
48
+ @ext_base_bpe = ext_base_bpe
49
+ @name = name
50
+ end
23
51
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.2"
4
+ VERSION = "0.0.3"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -11,5 +11,105 @@ rescue LoadError
11
11
  end
12
12
 
13
13
  module Tiktoken
14
- class Error < StandardError; end
14
+ class << self
15
+
16
+ # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
17
+ # it will reuse the instance of that type that was previously loaded
18
+ # @param name [Symbol|String] The name of the encoding to load
19
+ # @return [Tiktoken::Encoding] The encoding instance
20
+ # @example Encode and decode text
21
+ # enc = Tiktoken.get_encoding("cl100k_base")
22
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
23
+ def get_encoding(name)
24
+ name = name.to_sym
25
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
26
+
27
+ Tiktoken::Encoding.for_name_cached(name)
28
+ end
29
+
30
+ # Gets the encoding for an OpenAI model
31
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
32
+ # @return [Tiktoken::Encoding] The encoding instance
33
+ # @example Count tokens for text
34
+ # enc = Tiktoken.encoding_for_model("gpt-4")
35
+ # enc.encode("hello world").length #=> 2
36
+ def encoding_for_model(model_name)
37
+ for prefix in PREFIX_MODELS
38
+ if model_name.to_s.start_with?("#{prefix}-")
39
+ model_name = prefix
40
+ break
41
+ end
42
+ end
43
+
44
+ encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
45
+ return nil unless encoding_name
46
+
47
+ get_encoding(encoding_name)
48
+ end
49
+
50
+ # Lists all the encodings that are supported
51
+ # @return [Array<Symbol>] The list of supported encodings
52
+ def list_encoding_names
53
+ SUPPORTED_ENCODINGS
54
+ end
55
+
56
+ # Lists all the models that are supported
57
+ # @return [Array<Symbol>] The list of supported models
58
+ def list_model_names
59
+ MODEL_TO_ENCODING_NAME.keys
60
+ end
61
+
62
+ private
63
+
64
+ SUPPORTED_ENCODINGS = [
65
+ :r50k_base,
66
+ :p50k_base,
67
+ :p50k_edit,
68
+ :cl100k_base,
69
+ ]
70
+
71
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
72
+ # that is also MIT licensed but by OpenAI
73
+ MODEL_TO_ENCODING_NAME = {
74
+ "gpt-4": "cl100k_base",
75
+ "gpt-3.5-turbo": "cl100k_base",
76
+ # text
77
+ "text-davinci-003": "p50k_base",
78
+ "text-davinci-002": "p50k_base",
79
+ "text-davinci-001": "r50k_base",
80
+ "text-curie-001": "r50k_base",
81
+ "text-babbage-001": "r50k_base",
82
+ "text-ada-001": "r50k_base",
83
+ "davinci": "r50k_base",
84
+ "curie": "r50k_base",
85
+ "babbage": "r50k_base",
86
+ "ada": "r50k_base",
87
+ # code
88
+ "code-davinci-002": "p50k_base",
89
+ "code-davinci-001": "p50k_base",
90
+ "code-cushman-002": "p50k_base",
91
+ "code-cushman-001": "p50k_base",
92
+ "davinci-codex": "p50k_base",
93
+ "cushman-codex": "p50k_base",
94
+ # edit
95
+ "text-davinci-edit-001": "p50k_edit",
96
+ "code-davinci-edit-001": "p50k_edit",
97
+ # embeddings
98
+ "text-embedding-ada-002": "cl100k_base",
99
+ # old embeddings
100
+ "text-similarity-davinci-001": "r50k_base",
101
+ "text-similarity-curie-001": "r50k_base",
102
+ "text-similarity-babbage-001": "r50k_base",
103
+ "text-similarity-ada-001": "r50k_base",
104
+ "text-search-davinci-doc-001": "r50k_base",
105
+ "text-search-curie-doc-001": "r50k_base",
106
+ "text-search-babbage-doc-001": "r50k_base",
107
+ "text-search-ada-doc-001": "r50k_base",
108
+ "code-search-babbage-code-001": "r50k_base",
109
+ "code-search-ada-code-001": "r50k_base",
110
+ }
111
+
112
+ # these are models that have versioned variants (named "<model>-...") that are otherwise identical
113
+ PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
114
+ end
15
115
  end
metadata CHANGED
@@ -1,16 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: x64-mingw-ucrt
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-19 00:00:00.000000000 Z
11
+ date: 2023-03-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings
13
+ description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
+ used by OpenAI. It can be used to count the number of tokens in text before sending
15
+ it to OpenAI APIs.
14
16
  email:
15
17
  - isaac.a.park@gmail.com
16
18
  executables: []
@@ -24,6 +26,7 @@ files:
24
26
  - LICENSE.txt
25
27
  - README.md
26
28
  - Rakefile
29
+ - doctest_helper.rb
27
30
  - lib/tiktoken_ruby.rb
28
31
  - lib/tiktoken_ruby/3.1/tiktoken_ruby.so
29
32
  - lib/tiktoken_ruby/3.2/tiktoken_ruby.so
@@ -36,6 +39,7 @@ licenses:
36
39
  metadata:
37
40
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
38
41
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
42
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
39
43
  post_install_message:
40
44
  rdoc_options: []
41
45
  require_paths: