tiktoken_ruby 0.0.2-x64-mingw-ucrt → 0.0.3-x64-mingw-ucrt

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8a76f3a21d5c1994e02b643393862b4d4183c40d5c2ad004cda1280b7ca4c3d9
4
- data.tar.gz: fa20012e669bcd1ef57e6781488405c0436b65a960270a18e2603b1610278950
3
+ metadata.gz: 0fc02070b46f894e41faf76d01edb3448bcf9fec33d8ce9c84a066f3b6a3c42c
4
+ data.tar.gz: d3059efb8fb610aabf5a32427d6d82040882b1384c046cdbd8c8f556babbfd54
5
5
  SHA512:
6
- metadata.gz: db8f73e134a5fdb08e8dbb520e01b62829f373cc7f6073ca80928e6deeb5e4fee30d9fb57dbb4f691b570edaafad7eb6e507ead8d588b975a1a1ebbe477e8489
7
- data.tar.gz: 8f89833ea89e23fdc50d71ca927d69bc15ec45dc81e399e3014f66b854b039dc51bbb3faa5f468b478438aa34aa09cd4f2c7f6c7702e04dcee9b734e72a2a12f
6
+ metadata.gz: 1b5826fa7ca9377abc35a5c4dd985afa2960110def15b992d350363b9fe66340d27e3ee9845abd4ba2571e9bec6777eb0d46902cb4b88c5e1ff3fc9ef0ad2346
7
+ data.tar.gz: f4613b799248700b53e24eaa7e87f3cca774921955b3db8fd5cddecc027b6df37924daa5dabed55943e89bb0bc6d78d7528e6c33eb46dac5ecaa2f1a868af019
data/Gemfile CHANGED
@@ -14,3 +14,5 @@ gem "rspec", "~> 3.0"
14
14
 
15
15
  gem "standard", "~> 1.3"
16
16
  gem 'pry', '~> 0.14.2'
17
+
18
+ gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.2)
4
+ tiktoken_ruby (0.0.3)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -12,6 +12,7 @@ GEM
12
12
  json (2.6.3)
13
13
  language_server-protocol (3.17.0.3)
14
14
  method_source (1.0.0)
15
+ minitest (5.18.0)
15
16
  parallel (1.22.1)
16
17
  parser (3.2.1.1)
17
18
  ast (~> 2.4.1)
@@ -59,6 +60,12 @@ GEM
59
60
  rubocop (= 1.48.1)
60
61
  rubocop-performance (= 1.16.0)
61
62
  unicode-display_width (2.4.2)
63
+ webrick (1.7.0)
64
+ yard (0.9.28)
65
+ webrick (~> 1.7.0)
66
+ yard-doctest (0.1.17)
67
+ minitest
68
+ yard
62
69
 
63
70
  PLATFORMS
64
71
  arm64-darwin-22
@@ -72,6 +79,7 @@ DEPENDENCIES
72
79
  rspec (~> 3.0)
73
80
  standard (~> 1.3)
74
81
  tiktoken_ruby!
82
+ yard-doctest (~> 0.1.17)
75
83
 
76
84
  BUNDLED WITH
77
85
  2.4.6
data/README.md CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
+ Usage should be very similar to the Python library. Here's a simple example:
18
19
 
20
+ Encode and decode text
19
21
  ```ruby
20
- encoding = Tiktoken::Encoding.r50k_base
21
- tokens = encoding.encode("Hello world!")
22
- puts encoding.decode(tokens)
22
+ require 'tiktoken_ruby'
23
+
24
+ enc = Tiktoken.get_encoding("cl100k_base")
25
+ enc.decode(enc.encode("hello world")) #=> "hello world"
26
+ ```
27
+
28
+ Encoders can also be retrieved by model name
29
+ ```ruby
30
+ require 'tiktoken_ruby'
31
+
32
+ enc = Tiktoken.encoding_for_model("gpt-4")
33
+ enc.encode("hello world").length #=> 2
23
34
  ```
24
35
 
25
36
  ## Development
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require 'lib/tiktoken_ruby.rb'
Binary file
Binary file
@@ -1,23 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Tiktoken::Encoding
4
- def self.method_missing(method)
5
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(method))
3
+ class Tiktoken::Encoding
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
6
11
  end
7
12
 
8
- def initialize(ext_base_bpe)
9
- @ext_base_bpe = ext_base_bpe
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
10
20
  end
11
21
 
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
12
26
  def encode_ordinary(text)
13
27
  @ext_base_bpe.encode_ordinary(text)
14
28
  end
15
29
 
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
16
35
  def encode(text, allowed_special: [])
17
36
  @ext_base_bpe.encode(text, allowed_special)
18
37
  end
19
38
 
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
20
42
  def decode(tokens)
21
43
  @ext_base_bpe.decode(tokens)
22
44
  end
45
+
46
+ private
47
+ def initialize(ext_base_bpe, name)
48
+ @ext_base_bpe = ext_base_bpe
49
+ @name = name
50
+ end
23
51
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.2"
4
+ VERSION = "0.0.3"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -11,5 +11,105 @@ rescue LoadError
11
11
  end
12
12
 
13
13
  module Tiktoken
14
- class Error < StandardError; end
14
+ class << self
15
+
16
+ # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
17
+ # it will reuse the instance of that type that was previously loaded
18
+ # @param name [Symbol|String] The name of the encoding to load
19
+ # @return [Tiktoken::Encoding] The encoding instance
20
+ # @example Encode and decode text
21
+ # enc = Tiktoken.get_encoding("cl100k_base")
22
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
23
+ def get_encoding(name)
24
+ name = name.to_sym
25
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
26
+
27
+ Tiktoken::Encoding.for_name_cached(name)
28
+ end
29
+
30
+ # Gets the encoding for an OpenAI model
31
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
32
+ # @return [Tiktoken::Encoding] The encoding instance
33
+ # @example Count tokens for text
34
+ # enc = Tiktoken.encoding_for_model("gpt-4")
35
+ # enc.encode("hello world").length #=> 2
36
+ def encoding_for_model(model_name)
37
+ for prefix in PREFIX_MODELS
38
+ if model_name.to_s.start_with?("#{prefix}-")
39
+ model_name = prefix
40
+ break
41
+ end
42
+ end
43
+
44
+ encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
45
+ return nil unless encoding_name
46
+
47
+ get_encoding(encoding_name)
48
+ end
49
+
50
+ # Lists all the encodings that are supported
51
+ # @return [Array<Symbol>] The list of supported encodings
52
+ def list_encoding_names
53
+ SUPPORTED_ENCODINGS
54
+ end
55
+
56
+ # Lists all the models that are supported
57
+ # @return [Array<Symbol>] The list of supported models
58
+ def list_model_names
59
+ MODEL_TO_ENCODING_NAME.keys
60
+ end
61
+
62
+ private
63
+
64
+ SUPPORTED_ENCODINGS = [
65
+ :r50k_base,
66
+ :p50k_base,
67
+ :p50k_edit,
68
+ :cl100k_base,
69
+ ]
70
+
71
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
72
+ # that is also MIT licensed but by OpenAI
73
+ MODEL_TO_ENCODING_NAME = {
74
+ "gpt-4": "cl100k_base",
75
+ "gpt-3.5-turbo": "cl100k_base",
76
+ # text
77
+ "text-davinci-003": "p50k_base",
78
+ "text-davinci-002": "p50k_base",
79
+ "text-davinci-001": "r50k_base",
80
+ "text-curie-001": "r50k_base",
81
+ "text-babbage-001": "r50k_base",
82
+ "text-ada-001": "r50k_base",
83
+ "davinci": "r50k_base",
84
+ "curie": "r50k_base",
85
+ "babbage": "r50k_base",
86
+ "ada": "r50k_base",
87
+ # code
88
+ "code-davinci-002": "p50k_base",
89
+ "code-davinci-001": "p50k_base",
90
+ "code-cushman-002": "p50k_base",
91
+ "code-cushman-001": "p50k_base",
92
+ "davinci-codex": "p50k_base",
93
+ "cushman-codex": "p50k_base",
94
+ # edit
95
+ "text-davinci-edit-001": "p50k_edit",
96
+ "code-davinci-edit-001": "p50k_edit",
97
+ # embeddings
98
+ "text-embedding-ada-002": "cl100k_base",
99
+ # old embeddings
100
+ "text-similarity-davinci-001": "r50k_base",
101
+ "text-similarity-curie-001": "r50k_base",
102
+ "text-similarity-babbage-001": "r50k_base",
103
+ "text-similarity-ada-001": "r50k_base",
104
+ "text-search-davinci-doc-001": "r50k_base",
105
+ "text-search-curie-doc-001": "r50k_base",
106
+ "text-search-babbage-doc-001": "r50k_base",
107
+ "text-search-ada-doc-001": "r50k_base",
108
+ "code-search-babbage-code-001": "r50k_base",
109
+ "code-search-ada-code-001": "r50k_base",
110
+ }
111
+
112
+ # these are models that have versioned variants that are otherwise identical
113
+ PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
114
+ end
15
115
  end
metadata CHANGED
@@ -1,16 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: x64-mingw-ucrt
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-19 00:00:00.000000000 Z
11
+ date: 2023-03-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings
13
+ description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
+ used by OpenAI. It can be used to count the number of tokens in text before sending
15
+ it to OpenAI APIs.
14
16
  email:
15
17
  - isaac.a.park@gmail.com
16
18
  executables: []
@@ -24,6 +26,7 @@ files:
24
26
  - LICENSE.txt
25
27
  - README.md
26
28
  - Rakefile
29
+ - doctest_helper.rb
27
30
  - lib/tiktoken_ruby.rb
28
31
  - lib/tiktoken_ruby/3.1/tiktoken_ruby.so
29
32
  - lib/tiktoken_ruby/3.2/tiktoken_ruby.so
@@ -36,6 +39,7 @@ licenses:
36
39
  metadata:
37
40
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
38
41
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
42
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
39
43
  post_install_message:
40
44
  rdoc_options: []
41
45
  require_paths: