tiktoken_ruby 0.0.2-arm64-darwin → 0.0.3-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5dc44b2939b680a8e680ea5abf328389d5118bd86c7c967f218a84ad6f9aaae8
4
- data.tar.gz: c072e6039a5687cb80d5ed44389d2847660efb8d45e657557e6911e1786debb4
3
+ metadata.gz: 2222c8f701416004cca31e6ae0095b944e279c1cbfd1dfc035c25f4c3d8fc7ff
4
+ data.tar.gz: 2ea5b03f8eadfb064454d74db724880b8a5dd7746237b84328feb51ce5813cbe
5
5
  SHA512:
6
- metadata.gz: b6b1d7ebf494a0a7bfd48f47b2aaf16c32dedbebf4e76ad723083d1dbb10fc011f9ec573762d32808af497f11f0a64c5c163106e6176406758682667d7b7f0ed
7
- data.tar.gz: a7005b258a111a16e243b2912aa72bcb7f703ab46b3da66e6c05cc9c79e8ed4f6321d03212dba622ccc29b440b4e7e2f36da93f8377cd30176c91cc574c66367
6
+ metadata.gz: 145d66621b038e291c93274c54528824e6eb1244ce4411d134e5dbf2f2aaefab9561cf06d04a66be343aa169c8e2586b9bea8e09631b58a9a96d23f829aa5020
7
+ data.tar.gz: 4b9af794c9be96281758fc70a0ddaccfb90e4b85fd63c161425175c65ed28a179df9a7daec82503bc0ddb9a9f978ad6da738db79220c1db57f0fedc86df88cf7
data/Gemfile CHANGED
@@ -14,3 +14,5 @@ gem "rspec", "~> 3.0"
14
14
 
15
15
  gem "standard", "~> 1.3"
16
16
  gem 'pry', '~> 0.14.2'
17
+
18
+ gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.2)
4
+ tiktoken_ruby (0.0.3)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -12,6 +12,7 @@ GEM
12
12
  json (2.6.3)
13
13
  language_server-protocol (3.17.0.3)
14
14
  method_source (1.0.0)
15
+ minitest (5.18.0)
15
16
  parallel (1.22.1)
16
17
  parser (3.2.1.1)
17
18
  ast (~> 2.4.1)
@@ -59,6 +60,12 @@ GEM
59
60
  rubocop (= 1.48.1)
60
61
  rubocop-performance (= 1.16.0)
61
62
  unicode-display_width (2.4.2)
63
+ webrick (1.7.0)
64
+ yard (0.9.28)
65
+ webrick (~> 1.7.0)
66
+ yard-doctest (0.1.17)
67
+ minitest
68
+ yard
62
69
 
63
70
  PLATFORMS
64
71
  arm64-darwin-22
@@ -72,6 +79,7 @@ DEPENDENCIES
72
79
  rspec (~> 3.0)
73
80
  standard (~> 1.3)
74
81
  tiktoken_ruby!
82
+ yard-doctest (~> 0.1.17)
75
83
 
76
84
  BUNDLED WITH
77
85
  2.4.6
data/README.md CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
+ Usage should be very similar to the Python library. Here's a simple example:
18
19
 
20
+ Encode and decode text
19
21
  ```ruby
20
- encoding = Tiktoken::Encoding.r50k_base
21
- tokens = encoding.encode("Hello world!")
22
- puts encoding.decode(tokens)
22
+ require 'tiktoken_ruby'
23
+
24
+ enc = Tiktoken.get_encoding("cl100k_base")
25
+ enc.decode(enc.encode("hello world")) #=> "hello world"
26
+ ```
27
+
28
+ Encoders can also be retrieved by model name
29
+ ```ruby
30
+ require 'tiktoken_ruby'
31
+
32
+ enc = Tiktoken.encoding_for_model("gpt-4")
33
+ enc.encode("hello world").length #=> 2
23
34
  ```
24
35
 
25
36
  ## Development
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require 'lib/tiktoken_ruby.rb'
@@ -1,23 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Tiktoken::Encoding
4
- def self.method_missing(method)
5
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(method))
3
+ class Tiktoken::Encoding
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
6
11
  end
7
12
 
8
- def initialize(ext_base_bpe)
9
- @ext_base_bpe = ext_base_bpe
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
10
20
  end
11
21
 
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
12
26
  def encode_ordinary(text)
13
27
  @ext_base_bpe.encode_ordinary(text)
14
28
  end
15
29
 
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
16
35
  def encode(text, allowed_special: [])
17
36
  @ext_base_bpe.encode(text, allowed_special)
18
37
  end
19
38
 
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
20
42
  def decode(tokens)
21
43
  @ext_base_bpe.decode(tokens)
22
44
  end
45
+
46
+ private
47
+ def initialize(ext_base_bpe, name)
48
+ @ext_base_bpe = ext_base_bpe
49
+ @name = name
50
+ end
23
51
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.2"
4
+ VERSION = "0.0.3"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -11,5 +11,105 @@ rescue LoadError
11
11
  end
12
12
 
13
13
  module Tiktoken
14
- class Error < StandardError; end
14
+ class << self
15
+
16
+ # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
17
+ # it will reuse the instance of that type that was previously loaded
18
+ # @param name [Symbol|String] The name of the encoding to load
19
+ # @return [Tiktoken::Encoding] The encoding instance
20
+ # @example Encode and decode text
21
+ # enc = Tiktoken.get_encoding("cl100k_base")
22
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
23
+ def get_encoding(name)
24
+ name = name.to_sym
25
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
26
+
27
+ Tiktoken::Encoding.for_name_cached(name)
28
+ end
29
+
30
+ # Gets the encoding for an OpenAI model
31
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
32
+ # @return [Tiktoken::Encoding] The encoding instance
33
+ # @example Count tokens for text
34
+ # enc = Tiktoken.encoding_for_model("gpt-4")
35
+ # enc.encode("hello world").length #=> 2
36
+ def encoding_for_model(model_name)
37
+ for prefix in PREFIX_MODELS
38
+ if model_name.to_s.start_with?("#{prefix}-")
39
+ model_name = prefix
40
+ break
41
+ end
42
+ end
43
+
44
+ encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
45
+ return nil unless encoding_name
46
+
47
+ get_encoding(encoding_name)
48
+ end
49
+
50
+ # Lists all the encodings that are supported
51
+ # @return [Array<Symbol>] The list of supported encodings
52
+ def list_encoding_names
53
+ SUPPORTED_ENCODINGS
54
+ end
55
+
56
+ # Lists all the models that are supported
57
+ # @return [Array<Symbol>] The list of supported models
58
+ def list_model_names
59
+ MODEL_TO_ENCODING_NAME.keys
60
+ end
61
+
62
+ private
63
+
64
+ SUPPORTED_ENCODINGS = [
65
+ :r50k_base,
66
+ :p50k_base,
67
+ :p50k_edit,
68
+ :cl100k_base,
69
+ ]
70
+
71
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
72
+ # that is also MIT licensed but by OpenAI
73
+ MODEL_TO_ENCODING_NAME = {
74
+ "gpt-4": "cl100k_base",
75
+ "gpt-3.5-turbo": "cl100k_base",
76
+ # text
77
+ "text-davinci-003": "p50k_base",
78
+ "text-davinci-002": "p50k_base",
79
+ "text-davinci-001": "r50k_base",
80
+ "text-curie-001": "r50k_base",
81
+ "text-babbage-001": "r50k_base",
82
+ "text-ada-001": "r50k_base",
83
+ "davinci": "r50k_base",
84
+ "curie": "r50k_base",
85
+ "babbage": "r50k_base",
86
+ "ada": "r50k_base",
87
+ # code
88
+ "code-davinci-002": "p50k_base",
89
+ "code-davinci-001": "p50k_base",
90
+ "code-cushman-002": "p50k_base",
91
+ "code-cushman-001": "p50k_base",
92
+ "davinci-codex": "p50k_base",
93
+ "cushman-codex": "p50k_base",
94
+ # edit
95
+ "text-davinci-edit-001": "p50k_edit",
96
+ "code-davinci-edit-001": "p50k_edit",
97
+ # embeddings
98
+ "text-embedding-ada-002": "cl100k_base",
99
+ # old embeddings
100
+ "text-similarity-davinci-001": "r50k_base",
101
+ "text-similarity-curie-001": "r50k_base",
102
+ "text-similarity-babbage-001": "r50k_base",
103
+ "text-similarity-ada-001": "r50k_base",
104
+ "text-search-davinci-doc-001": "r50k_base",
105
+ "text-search-curie-doc-001": "r50k_base",
106
+ "text-search-babbage-doc-001": "r50k_base",
107
+ "text-search-ada-doc-001": "r50k_base",
108
+ "code-search-babbage-code-001": "r50k_base",
109
+ "code-search-ada-code-001": "r50k_base",
110
+ }
111
+
112
+ # these models have versioned variants (named "<model>-<suffix>") that are otherwise identical
113
+ PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
114
+ end
15
115
  end
metadata CHANGED
@@ -1,16 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-19 00:00:00.000000000 Z
11
+ date: 2023-03-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings
13
+ description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
+ used by OpenAI. It can be used to count the number of tokens in text before sending
15
+ it to OpenAI APIs.
14
16
  email:
15
17
  - isaac.a.park@gmail.com
16
18
  executables: []
@@ -24,6 +26,7 @@ files:
24
26
  - LICENSE.txt
25
27
  - README.md
26
28
  - Rakefile
29
+ - doctest_helper.rb
27
30
  - lib/tiktoken_ruby.rb
28
31
  - lib/tiktoken_ruby/2.7/tiktoken_ruby.bundle
29
32
  - lib/tiktoken_ruby/3.0/tiktoken_ruby.bundle
@@ -38,6 +41,7 @@ licenses:
38
41
  metadata:
39
42
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
40
43
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
44
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
41
45
  post_install_message:
42
46
  rdoc_options: []
43
47
  require_paths: