tokenizer-ruby 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '08ba527f839b738f57491cffd8490e9ff6eef4c34d0be6754ba987f4621d84b6'
4
- data.tar.gz: 91bef2ba51e51178bd045397c4374669842e90b309d2a65d423cbc6267f2b7b7
3
+ metadata.gz: 1eea09dc0997579eab072e4fc3705f8e5e5161110126f80478922ce65732d6a5
4
+ data.tar.gz: 4e6534b235e6477742a5170fc872cca3b59bbb44f32960fcb656a20d70d00c9f
5
5
  SHA512:
6
- metadata.gz: dc5e34e66506f9f798526e36f923f75a06f27ac75e16c194bd25f5c2421270dcb2fa511b36a3e4dacd835f9036c76fd5959ace8de1291f15ebe624f9093c30e3
7
- data.tar.gz: '083fb7459ff8cc61d5b50fcc085bc3dea7703b84d593004a8659d2db5214308a95a6ca41d685f78baab798ec07ef8e045e587e98340319c32757b1994afe565c'
6
+ metadata.gz: 4b7f5258322ef3d76d3a16a889b0e233f0078ed4970fff3b06d63f9d47fe3aeb39690fe370a37d0e52faf08bf808919d2bfe415a22bafa5bc31f38a6bcf70642
7
+ data.tar.gz: d2702b426ec82453372e9452a103921f025d68d44296e466c57e5b86d041a6c5c6b6558087a779767ac02080089a849cfee5262f863b0777331a42e51bb6b15a
data/CLAUDE.md CHANGED
@@ -174,7 +174,7 @@ Follow the same pattern as zvec-ruby:
174
174
 
175
175
  ## Publishing
176
176
 
177
- - RubyGems.org: `gem push tokenizer-ruby-*.gem`
177
+ - RubyGems.org: `GEM_HOST_API_KEY=<your-rubygems-api-key> gem push tokenizer-ruby-*.gem`
178
178
  - gem.coop: `GEM_HOST_API_KEY=<your-gem-coop-api-key> gem push tokenizer-ruby-*.gem --host https://beta.gem.coop/@johannesdwicahyo`
179
179
 
180
180
  ## Notes from zvec-ruby Experience
data/README.md ADDED
@@ -0,0 +1,107 @@
1
+ # tokenizer-ruby
2
+
3
+ Ruby bindings for [HuggingFace Tokenizers](https://github.com/huggingface/tokenizers). Fast, Rust-powered tokenization for any HuggingFace model — GPT-2, BERT, LLaMA, Claude, and more.
4
+
5
+ ## Installation
6
+
7
+ ```
8
+ gem install tokenizer-ruby
9
+ ```
10
+
11
+ Or add to your Gemfile:
12
+
13
+ ```ruby
14
+ gem "tokenizer-ruby"
15
+ ```
16
+
17
+ **Note:** Requires the Rust toolchain for compilation. Install it via [rustup](https://rustup.rs/).
18
+
19
+ ## Usage
20
+
21
+ ### Load a tokenizer
22
+
23
+ ```ruby
24
+ require "tokenizer_ruby"
25
+
26
+ # From HuggingFace Hub
27
+ tokenizer = TokenizerRuby::Tokenizer.from_pretrained("gpt2")
28
+ tokenizer = TokenizerRuby::Tokenizer.from_pretrained("bert-base-uncased")
29
+
30
+ # From a local file
31
+ tokenizer = TokenizerRuby::Tokenizer.from_file("/path/to/tokenizer.json")
32
+ ```
33
+
34
+ ### Encode and decode
35
+
36
+ ```ruby
37
+ encoding = tokenizer.encode("Hello, world!")
38
+ encoding.ids # => [15496, 11, 995, 0]
39
+ encoding.tokens # => ["Hello", ",", " world", "!"]
40
+ encoding.offsets # => [[0, 5], [5, 6], [6, 12], [12, 13]]
41
+ encoding.attention_mask # => [1, 1, 1, 1]
42
+ encoding.length # => 4
43
+
44
+ tokenizer.decode([15496, 11, 995, 0]) # => "Hello, world!"
45
+ ```
46
+
47
+ ### Batch processing
48
+
49
+ ```ruby
50
+ encodings = tokenizer.encode_batch(["Hello", "World"])
51
+ decoded = tokenizer.decode_batch(encodings.map(&:ids))
52
+ # => ["Hello", "World"]
53
+ ```
54
+
55
+ ### Token counting
56
+
57
+ ```ruby
58
+ tokenizer.count("Hello, world!") # => 4
59
+ ```
60
+
61
+ ### Truncation
62
+
63
+ ```ruby
64
+ # Truncate text to a token limit
65
+ tokenizer.truncate("This is a long sentence...", max_tokens: 5)
66
+
67
+ # Enable automatic truncation on all encodes
68
+ tokenizer.enable_truncation(max_length: 512)
69
+ ```
70
+
71
+ ### Padding
72
+
73
+ ```ruby
74
+ tokenizer.enable_padding(length: 128, pad_token: "[PAD]")
75
+ encoding = tokenizer.encode("Hello")
76
+ encoding.ids.length # => 128
77
+ encoding.attention_mask # => [1, 0, 0, 0, ...]
78
+ ```
79
+
80
+ ### Vocabulary
81
+
82
+ ```ruby
83
+ tokenizer.vocab_size # => 50257
84
+ tokenizer.token_to_id("hello") # => 31373
85
+ tokenizer.id_to_token(31373) # => "hello"
86
+ ```
87
+
88
+ ## Requirements
89
+
90
+ - Ruby >= 3.1
91
+ - Rust toolchain (for building from source)
92
+
93
+ ## Development
94
+
95
+ ```
96
+ bundle install
97
+ bundle exec rake compile
98
+ bundle exec rake test
99
+ ```
100
+
101
+ ## License
102
+
103
+ MIT
104
+
105
+ ## Author
106
+
107
+ Johannes Dwi Cahyo — [@johannesdwicahyo](https://github.com/johannesdwicahyo)
@@ -2,13 +2,17 @@
2
2
 
3
3
  module TokenizerRuby
4
4
  class Encoding
5
- attr_reader :ids, :tokens, :offsets, :attention_mask
5
+ attr_reader :ids, :tokens, :offsets, :attention_mask,
6
+ :type_ids, :special_tokens_mask, :word_ids
6
7
 
7
- def initialize(ids:, tokens:, offsets:, attention_mask:)
8
+ def initialize(ids:, tokens:, offsets:, attention_mask:, type_ids: nil, special_tokens_mask: nil, word_ids: nil)
8
9
  @ids = ids
9
10
  @tokens = tokens
10
11
  @offsets = offsets
11
12
  @attention_mask = attention_mask
13
+ @type_ids = type_ids
14
+ @special_tokens_mask = special_tokens_mask
15
+ @word_ids = word_ids
12
16
  end
13
17
 
14
18
  def length
@@ -2,6 +2,14 @@
2
2
 
3
3
  module TokenizerRuby
4
4
  class Tokenizer
5
+ def initialize(path_or_internal)
6
+ if path_or_internal.is_a?(String)
7
+ @inner = InternalTokenizer.from_file(path_or_internal)
8
+ else
9
+ @inner = path_or_internal
10
+ end
11
+ end
12
+
5
13
  def self.from_pretrained(identifier)
6
14
  new(InternalTokenizer.from_pretrained(identifier))
7
15
  end
@@ -11,7 +19,13 @@ module TokenizerRuby
11
19
  end
12
20
 
13
21
  def encode(text)
14
- result = @inner._encode(text)
22
+ raise TokenizerRuby::Error, "encode expects a String, got #{text.class}" unless text.is_a?(String)
23
+
24
+ begin
25
+ result = @inner._encode(text)
26
+ rescue => e
27
+ raise TokenizerRuby::Error, "failed to encode text: #{e.message}"
28
+ end
15
29
  Encoding.new(
16
30
  ids: result[:ids],
17
31
  tokens: result[:tokens],
@@ -21,7 +35,13 @@ module TokenizerRuby
21
35
  end
22
36
 
23
37
  def decode(ids)
24
- @inner._decode(ids)
38
+ raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}" unless ids.is_a?(Array)
39
+
40
+ begin
41
+ @inner._decode(ids)
42
+ rescue => e
43
+ raise TokenizerRuby::Error, "failed to decode ids: #{e.message}"
44
+ end
25
45
  end
26
46
 
27
47
  def encode_batch(texts)
@@ -57,6 +77,8 @@ module TokenizerRuby
57
77
  end
58
78
 
59
79
  def truncate(text, max_tokens:)
80
+ raise TokenizerRuby::Error, "max_tokens must be positive, got #{max_tokens}" unless max_tokens > 0
81
+
60
82
  encoding = encode(text)
61
83
  return text if encoding.length <= max_tokens
62
84
 
@@ -71,11 +93,5 @@ module TokenizerRuby
71
93
  def enable_padding(length:, pad_token: "[PAD]")
72
94
  @inner._enable_padding(length, pad_token)
73
95
  end
74
-
75
- private
76
-
77
- def initialize(inner)
78
- @inner = inner
79
- end
80
96
  end
81
97
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module TokenizerRuby
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizer-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -32,6 +32,7 @@ extra_rdoc_files: []
32
32
  files:
33
33
  - CLAUDE.md
34
34
  - LICENSE
35
+ - README.md
35
36
  - ext/tokenizer_ruby/Cargo.toml
36
37
  - ext/tokenizer_ruby/extconf.rb
37
38
  - ext/tokenizer_ruby/src/lib.rs