tiktoken_ruby 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: de2d6e8e83771f2ef51351e019e9cebc7163a1775bfc0e812da58371574b9b63
4
- data.tar.gz: fe2c629e8b435a181bfa4524655bfa137f1660061ab4f780e0edc15a11a7538d
3
+ metadata.gz: 347a10d045e27fca4cdfec03c4d2eac0150448b8f2125d5bcbcd1b92db83499a
4
+ data.tar.gz: de048e8320daa15b27ffa7ccdd9f7ec618cb2a3ad96fe2bedf71e6f780fc8b6f
5
5
  SHA512:
6
- metadata.gz: bbf721963e873464fae055d23308068fcdac8db4e27dea28653c3fc017f0803da59b4c549a0fb6b6339f79bc4379e2913e1c47fa25f9894ebde840b23aa81edb
7
- data.tar.gz: 1275369f56a2498ce39c5a2b259efdab0684ae834c6d769bfe23d7190c69917a22d46dc7b7675cd15c8a44dcfa8f7c3be2df850035236fa33f8e55afd31db42b
6
+ metadata.gz: 7c587d4b18e777a0f7692aab857fec1fc3b3bcceceab158439197123786475c65d51f5356351bcf0c9c327b94e2ff15a83b2144c2db329aedea0456c1d763ff9
7
+ data.tar.gz: abfdeb836d81555effa5ef6647a77b360dae2b975bb3dc23a22c779f6dd82256630942ddfef01a837ff5195896fa4251925e87dec0d245dce1c8e83719a488f1
data/Cargo.lock CHANGED
@@ -31,16 +31,16 @@ checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
31
31
 
32
32
  [[package]]
33
33
  name = "bindgen"
34
- version = "0.66.1"
34
+ version = "0.69.4"
35
35
  source = "registry+https://github.com/rust-lang/crates.io-index"
36
- checksum = "f2b84e06fc203107bfbad243f4aba2af864eb7db3b1cf46ea0a023b0b433d2a7"
36
+ checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
37
37
  dependencies = [
38
38
  "bitflags 2.4.0",
39
39
  "cexpr",
40
40
  "clang-sys",
41
+ "itertools",
41
42
  "lazy_static",
42
43
  "lazycell",
43
- "peeking_take_while",
44
44
  "proc-macro2",
45
45
  "quote",
46
46
  "regex",
@@ -114,6 +114,12 @@ dependencies = [
114
114
  "libloading",
115
115
  ]
116
116
 
117
+ [[package]]
118
+ name = "either"
119
+ version = "1.10.0"
120
+ source = "registry+https://github.com/rust-lang/crates.io-index"
121
+ checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
122
+
117
123
  [[package]]
118
124
  name = "fancy-regex"
119
125
  version = "0.11.0"
@@ -130,6 +136,15 @@ version = "0.3.1"
130
136
  source = "registry+https://github.com/rust-lang/crates.io-index"
131
137
  checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
132
138
 
139
+ [[package]]
140
+ name = "itertools"
141
+ version = "0.12.1"
142
+ source = "registry+https://github.com/rust-lang/crates.io-index"
143
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
144
+ dependencies = [
145
+ "either",
146
+ ]
147
+
133
148
  [[package]]
134
149
  name = "lazy_static"
135
150
  version = "1.4.0"
@@ -242,12 +257,6 @@ dependencies = [
242
257
  "windows-sys",
243
258
  ]
244
259
 
245
- [[package]]
246
- name = "peeking_take_while"
247
- version = "0.1.2"
248
- source = "registry+https://github.com/rust-lang/crates.io-index"
249
- checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
250
-
251
260
  [[package]]
252
261
  name = "proc-macro2"
253
262
  version = "1.0.66"
@@ -268,18 +277,18 @@ dependencies = [
268
277
 
269
278
  [[package]]
270
279
  name = "rb-sys"
271
- version = "0.9.81"
280
+ version = "0.9.87"
272
281
  source = "registry+https://github.com/rust-lang/crates.io-index"
273
- checksum = "a57240b308b155b09dce81e32829966a99f52d1088b45957e4283e526c5317a1"
282
+ checksum = "225103e3d69bbfe8831f9fd0d2461335f3a9dd06aa6e88bcb6d6970383494d06"
274
283
  dependencies = [
275
284
  "rb-sys-build",
276
285
  ]
277
286
 
278
287
  [[package]]
279
288
  name = "rb-sys-build"
280
- version = "0.9.81"
289
+ version = "0.9.87"
281
290
  source = "registry+https://github.com/rust-lang/crates.io-index"
282
- checksum = "f24ce877a4c5d07f06f6aa6fec3ac95e4b357b9f73b0f5445d8cbb7266d410e8"
291
+ checksum = "bacce8095a5167d5ede618bbd9353e9d9e2f32ddaf54be911106f0ee6baacf09"
283
292
  dependencies = [
284
293
  "bindgen",
285
294
  "lazy_static",
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.7)
4
+ tiktoken_ruby (0.0.8)
5
5
  rb_sys (>= 0.9.86)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -1,8 +1,14 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
2
+
2
3
  # tiktoken_ruby
3
4
 
4
5
  [Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
5
- This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
6
+ This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
7
+
8
+ ## Request for maintainers
9
+
10
+ I can't really put substantial time into maintaining this, probably nothing more than a couple of hours every few months. If you have experience maintaining Ruby gems and would like to
11
+ lend a hand, please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26).
6
12
 
7
13
  ## Installation
8
14
 
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
21
  $ gem install tiktoken_ruby
16
22
 
17
23
  ## Usage
24
+
18
25
  Usage should be very similar to the Python library. Here's a simple example:
19
26
 
20
27
  Encode and decode text
28
+
21
29
  ```ruby
22
30
  require 'tiktoken_ruby'
23
-
24
31
  enc = Tiktoken.get_encoding("cl100k_base")
25
32
  enc.decode(enc.encode("hello world")) #=> "hello world"
26
33
  ```
27
34
 
28
35
  Encoders can also be retrieved by model name
36
+
29
37
  ```ruby
30
38
  require 'tiktoken_ruby'
31
39
 
@@ -53,7 +61,6 @@ bundle exec rake compile
53
61
  bundle exec rake spec
54
62
  ```
55
63
 
56
-
57
64
  ## License
58
65
 
59
66
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Tiktoken::Encoding
4
+ CACHE_MUTEX = Mutex.new
5
+
4
6
  attr_reader :name
5
7
 
6
8
  # This returns a new Tiktoken::Encoding instance for the requested encoding
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
15
17
  # @param encoding [Symbol] The name of the encoding to load
16
18
  # @return [Tiktoken::Encoding] The encoding instance
17
19
  def self.for_name_cached(encoding)
18
- @encodings ||= {}
19
- @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ CACHE_MUTEX.synchronize do
21
+ @encodings ||= {}
22
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
23
+ end
20
24
  end
21
25
 
22
26
  # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.7"
4
+ VERSION = "0.0.8"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -28,7 +28,7 @@ module Tiktoken
28
28
 
29
29
  # Gets the encoding for an OpenAI model
30
30
  # @param model_name [Symbol|String] The name of the model to get the encoding for
31
- # @return [Tiktoken::Encoding] The encoding instance
31
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
32
32
  # @example Count tokens for text
33
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
34
34
  # enc.encode("hello world").length #=> 2
@@ -37,10 +37,12 @@ module Tiktoken
37
37
  return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
38
38
  end
39
39
 
40
- MODEL_PREFIX_TO_ENCODING.each do |prefix, encoding|
41
- if model_name.start_with?(prefix.to_s)
42
- return get_encoding(encoding)
43
- end
40
+ _prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
41
+ model_name.start_with?(prefix.to_s)
42
+ end
43
+
44
+ if encoding
45
+ get_encoding(encoding)
44
46
  end
45
47
  end
46
48
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-02-12 00:00:00.000000000 Z
11
+ date: 2024-04-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys