tiktoken_ruby 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 417d96c1d3bb9bac547fb3cc632e061894e13e7d7673a74e08ddc33d33bc3cbb
4
- data.tar.gz: b27d96b688bc4fa5f31cdecfaa629306e262bd34960a25bda93e7b8364e69c57
3
+ metadata.gz: abd9dc6cb619b53de92f8e788dc81ca5df2a01e6de72152cbac3110ca1d228a6
4
+ data.tar.gz: c0f2e679550ec6a8bec569b3c16e735ac469839a58c4ce2747a6d37dd5399d17
5
5
  SHA512:
6
- metadata.gz: 03cdce8758407d412fe29e580ed5caed455d8153923c982b60ed0440f3537f73034c121fa758e01b86354368443ff2e1f2ed4b255833ccf385d99ff79f337058
7
- data.tar.gz: 55b2e0db635609d0f756abef80e761b828eedc345b442b620c086f3e07072895b094a115993d4083633f34511e5a74ff38a2f0187a6308b28abd6f9d5483c016
6
+ metadata.gz: d2386df25639f6e713b450cd0f671bb7613c11f4d7925f566703e422ff22f21e48c8176ef277792908da161232242c58816c0d0964f66494089a5a93732ee062
7
+ data.tar.gz: d03fb53b1f9cd4f8d37cd0ceaa5b1943d55d981e4288c55bba5483b7253400e06f60618289c292539cf2a7fbad3752a85bd4ac02ef12c6ad870a6db22a0bd3d8
data/Gemfile CHANGED
@@ -14,3 +14,5 @@ gem "rspec", "~> 3.0"
14
14
 
15
15
  gem "standard", "~> 1.3"
16
16
  gem 'pry', '~> 0.14.2'
17
+
18
+ gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.2)
4
+ tiktoken_ruby (0.0.3)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -12,6 +12,7 @@ GEM
12
12
  json (2.6.3)
13
13
  language_server-protocol (3.17.0.3)
14
14
  method_source (1.0.0)
15
+ minitest (5.18.0)
15
16
  parallel (1.22.1)
16
17
  parser (3.2.1.1)
17
18
  ast (~> 2.4.1)
@@ -59,6 +60,12 @@ GEM
59
60
  rubocop (= 1.48.1)
60
61
  rubocop-performance (= 1.16.0)
61
62
  unicode-display_width (2.4.2)
63
+ webrick (1.7.0)
64
+ yard (0.9.28)
65
+ webrick (~> 1.7.0)
66
+ yard-doctest (0.1.17)
67
+ minitest
68
+ yard
62
69
 
63
70
  PLATFORMS
64
71
  arm64-darwin-22
@@ -71,6 +78,7 @@ DEPENDENCIES
71
78
  rspec (~> 3.0)
72
79
  standard (~> 1.3)
73
80
  tiktoken_ruby!
81
+ yard-doctest (~> 0.1.17)
74
82
 
75
83
  BUNDLED WITH
76
84
  2.4.6
data/README.md CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
+ Usage should be very similar to the Python library. Here's a simple example:
18
19
 
20
+ Encode and decode text
19
21
  ```ruby
20
- encoding = Tiktoken::Encoding.r50k_base
21
- tokens = encoding.encode("Hello world!")
22
- puts encoding.decode(tokens)
22
+ require 'tiktoken_ruby'
23
+
24
+ enc = Tiktoken.get_encoding("cl100k_base")
25
+ enc.decode(enc.encode("hello world")) #=> "hello world"
26
+ ```
27
+
28
+ Encoders can also be retrieved by model name
29
+ ```ruby
30
+ require 'tiktoken_ruby'
31
+
32
+ enc = Tiktoken.encoding_for_model("gpt-4")
33
+ enc.encode("hello world").length #=> 2
23
34
  ```
24
35
 
25
36
  ## Development
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require 'lib/tiktoken_ruby.rb'
@@ -35,7 +35,7 @@ fn init() -> Result<(), Error> {
35
35
  let factory_module = module.define_module("BpeFactory")?;
36
36
  factory_module.define_singleton_method("r50k_base", function!(r50k_base, 0))?;
37
37
  factory_module.define_singleton_method("p50k_base", function!(p50k_base, 0))?;
38
- factory_module.define_singleton_method("p50k_base", function!(p50k_edit, 0))?;
38
+ factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
39
39
  factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
40
40
 
41
41
 
@@ -1,23 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Tiktoken::Encoding
4
- def self.method_missing(method)
5
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(method))
3
+ class Tiktoken::Encoding
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
6
11
  end
7
12
 
8
- def initialize(ext_base_bpe)
9
- @ext_base_bpe = ext_base_bpe
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
10
20
  end
11
21
 
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
12
26
  def encode_ordinary(text)
13
27
  @ext_base_bpe.encode_ordinary(text)
14
28
  end
15
29
 
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
16
35
  def encode(text, allowed_special: [])
17
36
  @ext_base_bpe.encode(text, allowed_special)
18
37
  end
19
38
 
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
20
42
  def decode(tokens)
21
43
  @ext_base_bpe.decode(tokens)
22
44
  end
45
+
46
+ private
47
+ def initialize(ext_base_bpe, name)
48
+ @ext_base_bpe = ext_base_bpe
49
+ @name = name
50
+ end
23
51
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.2"
4
+ VERSION = "0.0.3"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -11,5 +11,105 @@ rescue LoadError
11
11
  end
12
12
 
13
13
  module Tiktoken
14
- class Error < StandardError; end
14
+ class << self
15
+
16
+ # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
17
+ # it will reuse the instance of that type that was previously loaded
18
+ # @param name [Symbol|String] The name of the encoding to load
19
+ # @return [Tiktoken::Encoding] The encoding instance
20
+ # @example Encode and decode text
21
+ # enc = Tiktoken.get_encoding("cl100k_base")
22
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
23
+ def get_encoding(name)
24
+ name = name.to_sym
25
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
26
+
27
+ Tiktoken::Encoding.for_name_cached(name)
28
+ end
29
+
30
+ # Gets the encoding for an OpenAI model
31
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
32
+ # @return [Tiktoken::Encoding] The encoding instance
33
+ # @example Count tokens for text
34
+ # enc = Tiktoken.encoding_for_model("gpt-4")
35
+ # enc.encode("hello world").length #=> 2
36
+ def encoding_for_model(model_name)
37
+ for prefix in PREFIX_MODELS
38
+ if model_name.to_s.start_with?("#{prefix}-")
39
+ model_name = prefix
40
+ break
41
+ end
42
+ end
43
+
44
+ encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
45
+ return nil unless encoding_name
46
+
47
+ get_encoding(encoding_name)
48
+ end
49
+
50
+ # Lists all the encodings that are supported
51
+ # @return [Array<Symbol>] The list of supported encodings
52
+ def list_encoding_names
53
+ SUPPORTED_ENCODINGS
54
+ end
55
+
56
+ # Lists all the models that are supported
57
+ # @return [Array<Symbol>] The list of supported models
58
+ def list_model_names
59
+ MODEL_TO_ENCODING_NAME.keys
60
+ end
61
+
62
+ private
63
+
64
+ SUPPORTED_ENCODINGS = [
65
+ :r50k_base,
66
+ :p50k_base,
67
+ :p50k_edit,
68
+ :cl100k_base,
69
+ ]
70
+
71
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
72
+ # that is also MIT licensed but by OpenAI
73
+ MODEL_TO_ENCODING_NAME = {
74
+ "gpt-4": "cl100k_base",
75
+ "gpt-3.5-turbo": "cl100k_base",
76
+ # text
77
+ "text-davinci-003": "p50k_base",
78
+ "text-davinci-002": "p50k_base",
79
+ "text-davinci-001": "r50k_base",
80
+ "text-curie-001": "r50k_base",
81
+ "text-babbage-001": "r50k_base",
82
+ "text-ada-001": "r50k_base",
83
+ "davinci": "r50k_base",
84
+ "curie": "r50k_base",
85
+ "babbage": "r50k_base",
86
+ "ada": "r50k_base",
87
+ # code
88
+ "code-davinci-002": "p50k_base",
89
+ "code-davinci-001": "p50k_base",
90
+ "code-cushman-002": "p50k_base",
91
+ "code-cushman-001": "p50k_base",
92
+ "davinci-codex": "p50k_base",
93
+ "cushman-codex": "p50k_base",
94
+ # edit
95
+ "text-davinci-edit-001": "p50k_edit",
96
+ "code-davinci-edit-001": "p50k_edit",
97
+ # embeddings
98
+ "text-embedding-ada-002": "cl100k_base",
99
+ # old embeddings
100
+ "text-similarity-davinci-001": "r50k_base",
101
+ "text-similarity-curie-001": "r50k_base",
102
+ "text-similarity-babbage-001": "r50k_base",
103
+ "text-similarity-ada-001": "r50k_base",
104
+ "text-search-davinci-doc-001": "r50k_base",
105
+ "text-search-curie-doc-001": "r50k_base",
106
+ "text-search-babbage-doc-001": "r50k_base",
107
+ "text-search-ada-doc-001": "r50k_base",
108
+ "code-search-babbage-code-001": "r50k_base",
109
+ "code-search-ada-code-001": "r50k_base",
110
+ }
111
+
112
+ # these are models that have versioned variants that are otherwise identical
113
+ PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
114
+ end
15
115
  end
@@ -9,17 +9,20 @@ Gem::Specification.new do |spec|
9
9
  spec.email = ["isaac.a.park@gmail.com"]
10
10
 
11
11
  spec.summary = "Ruby wrapper for Tiktoken"
12
- spec.description = "Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings"
12
+ spec.description = "An unofficial Ruby wrapper for Tiktoken, " +
13
+ "a BPE tokenizer written by and used by OpenAI. It can be used to " +
14
+ "count the number of tokens in text before sending it to OpenAI APIs."
15
+
13
16
  spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
14
17
  spec.license = "MIT"
15
18
  spec.required_ruby_version = ">= 2.6.0"
16
19
  spec.required_rubygems_version = ">= 3.1.0"
17
20
  spec.platform = Gem::Platform::RUBY
18
21
 
19
- #spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
20
-
21
22
  spec.metadata["homepage_uri"] = spec.homepage
22
23
  spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
24
+ spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
25
+
23
26
  #spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
24
27
 
25
28
  # Specify which files should be added to the gem when it is released.
metadata CHANGED
@@ -1,16 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-19 00:00:00.000000000 Z
11
+ date: 2023-03-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings
13
+ description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
+ used by OpenAI. It can be used to count the number of tokens in text before sending
15
+ it to OpenAI APIs.
14
16
  email:
15
17
  - isaac.a.park@gmail.com
16
18
  executables: []
@@ -27,6 +29,7 @@ files:
27
29
  - LICENSE.txt
28
30
  - README.md
29
31
  - Rakefile
32
+ - doctest_helper.rb
30
33
  - ext/tiktoken_ruby/Cargo.toml
31
34
  - ext/tiktoken_ruby/extconf.rb
32
35
  - ext/tiktoken_ruby/src/core_bpe_wrapper.rs
@@ -42,6 +45,7 @@ licenses:
42
45
  metadata:
43
46
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
44
47
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
48
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
45
49
  post_install_message:
46
50
  rdoc_options: []
47
51
  require_paths: