tokenizer-ruby 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +1 -1
- data/README.md +107 -0
- data/lib/tokenizer_ruby/encoding.rb +6 -2
- data/lib/tokenizer_ruby/tokenizer.rb +24 -8
- data/lib/tokenizer_ruby/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1eea09dc0997579eab072e4fc3705f8e5e5161110126f80478922ce65732d6a5
|
|
4
|
+
data.tar.gz: 4e6534b235e6477742a5170fc872cca3b59bbb44f32960fcb656a20d70d00c9f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4b7f5258322ef3d76d3a16a889b0e233f0078ed4970fff3b06d63f9d47fe3aeb39690fe370a37d0e52faf08bf808919d2bfe415a22bafa5bc31f38a6bcf70642
|
|
7
|
+
data.tar.gz: d2702b426ec82453372e9452a103921f025d68d44296e466c57e5b86d041a6c5c6b6558087a779767ac02080089a849cfee5262f863b0777331a42e51bb6b15a
|
data/CLAUDE.md
CHANGED
|
@@ -174,7 +174,7 @@ Follow the same pattern as zvec-ruby:
|
|
|
174
174
|
|
|
175
175
|
## Publishing
|
|
176
176
|
|
|
177
|
-
- RubyGems.org: `gem push tokenizer-ruby-*.gem`
|
|
177
|
+
- RubyGems.org: `GEM_HOST_API_KEY=<REDACTED — a live rubygems.org API key was committed here in the published package; revoke/rotate it immediately> gem push tokenizer-ruby-*.gem`
|
|
178
178
|
- gem.coop: `GEM_HOST_API_KEY=<REDACTED — a live gem.coop API key was committed here in the published package; revoke/rotate it immediately> gem push tokenizer-ruby-*.gem --host https://beta.gem.coop/@johannesdwicahyo`
|
|
179
179
|
|
|
180
180
|
## Notes from zvec-ruby Experience
|
data/README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# tokenizer-ruby
|
|
2
|
+
|
|
3
|
+
Ruby bindings for [HuggingFace Tokenizers](https://github.com/huggingface/tokenizers). Fast, Rust-powered tokenization for any HuggingFace model — GPT-2, BERT, LLaMA, Claude, and more.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
gem install tokenizer-ruby
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Or add to your Gemfile:
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
gem "tokenizer-ruby"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
**Note:** Requires Rust toolchain for compilation. Install via [rustup](https://rustup.rs/).
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
### Load a tokenizer
|
|
22
|
+
|
|
23
|
+
```ruby
|
|
24
|
+
require "tokenizer_ruby"
|
|
25
|
+
|
|
26
|
+
# From HuggingFace Hub
|
|
27
|
+
tokenizer = TokenizerRuby::Tokenizer.from_pretrained("gpt2")
|
|
28
|
+
tokenizer = TokenizerRuby::Tokenizer.from_pretrained("bert-base-uncased")
|
|
29
|
+
|
|
30
|
+
# From a local file
|
|
31
|
+
tokenizer = TokenizerRuby::Tokenizer.from_file("/path/to/tokenizer.json")
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Encode and decode
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
encoding = tokenizer.encode("Hello, world!")
|
|
38
|
+
encoding.ids # => [15496, 11, 995, 0]
|
|
39
|
+
encoding.tokens # => ["Hello", ",", " world", "!"]
|
|
40
|
+
encoding.offsets # => [[0, 5], [5, 6], [6, 12], [12, 13]]
|
|
41
|
+
encoding.attention_mask # => [1, 1, 1, 1]
|
|
42
|
+
encoding.length # => 4
|
|
43
|
+
|
|
44
|
+
tokenizer.decode([15496, 11, 995, 0]) # => "Hello, world!"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Batch processing
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
encodings = tokenizer.encode_batch(["Hello", "World"])
|
|
51
|
+
decoded = tokenizer.decode_batch(encodings.map(&:ids))
|
|
52
|
+
# => ["Hello", "World"]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Token counting
|
|
56
|
+
|
|
57
|
+
```ruby
|
|
58
|
+
tokenizer.count("Hello, world!") # => 4
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Truncation
|
|
62
|
+
|
|
63
|
+
```ruby
|
|
64
|
+
# Truncate text to a token limit
|
|
65
|
+
tokenizer.truncate("This is a long sentence...", max_tokens: 5)
|
|
66
|
+
|
|
67
|
+
# Enable automatic truncation on all encodes
|
|
68
|
+
tokenizer.enable_truncation(max_length: 512)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Padding
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
tokenizer.enable_padding(length: 128, pad_token: "[PAD]")
|
|
75
|
+
encoding = tokenizer.encode("Hello")
|
|
76
|
+
encoding.ids.length # => 128
|
|
77
|
+
encoding.attention_mask # => [1, 0, 0, 0, ...]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Vocabulary
|
|
81
|
+
|
|
82
|
+
```ruby
|
|
83
|
+
tokenizer.vocab_size # => 50257
|
|
84
|
+
tokenizer.token_to_id("hello") # => 31373
|
|
85
|
+
tokenizer.id_to_token(31373) # => "hello"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Requirements
|
|
89
|
+
|
|
90
|
+
- Ruby >= 3.1
|
|
91
|
+
- Rust toolchain (for building from source)
|
|
92
|
+
|
|
93
|
+
## Development
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
bundle install
|
|
97
|
+
bundle exec rake compile
|
|
98
|
+
bundle exec rake test
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
|
|
103
|
+
MIT
|
|
104
|
+
|
|
105
|
+
## Author
|
|
106
|
+
|
|
107
|
+
Johannes Dwi Cahyo — [@johannesdwicahyo](https://github.com/johannesdwicahyo)
|
|
@@ -2,13 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
module TokenizerRuby
|
|
4
4
|
class Encoding
|
|
5
|
-
attr_reader :ids, :tokens, :offsets, :attention_mask
|
|
5
|
+
attr_reader :ids, :tokens, :offsets, :attention_mask,
|
|
6
|
+
:type_ids, :special_tokens_mask, :word_ids
|
|
6
7
|
|
|
7
|
-
def initialize(ids:, tokens:, offsets:, attention_mask:)
|
|
8
|
+
def initialize(ids:, tokens:, offsets:, attention_mask:, type_ids: nil, special_tokens_mask: nil, word_ids: nil)
|
|
8
9
|
@ids = ids
|
|
9
10
|
@tokens = tokens
|
|
10
11
|
@offsets = offsets
|
|
11
12
|
@attention_mask = attention_mask
|
|
13
|
+
@type_ids = type_ids
|
|
14
|
+
@special_tokens_mask = special_tokens_mask
|
|
15
|
+
@word_ids = word_ids
|
|
12
16
|
end
|
|
13
17
|
|
|
14
18
|
def length
|
|
@@ -2,6 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
module TokenizerRuby
|
|
4
4
|
class Tokenizer
|
|
5
|
+
def initialize(path_or_internal)
|
|
6
|
+
if path_or_internal.is_a?(String)
|
|
7
|
+
@inner = InternalTokenizer.from_file(path_or_internal)
|
|
8
|
+
else
|
|
9
|
+
@inner = path_or_internal
|
|
10
|
+
end
|
|
11
|
+
end
|
|
12
|
+
|
|
5
13
|
def self.from_pretrained(identifier)
|
|
6
14
|
new(InternalTokenizer.from_pretrained(identifier))
|
|
7
15
|
end
|
|
@@ -11,7 +19,13 @@ module TokenizerRuby
|
|
|
11
19
|
end
|
|
12
20
|
|
|
13
21
|
def encode(text)
|
|
14
|
-
|
|
22
|
+
raise TokenizerRuby::Error, "encode expects a String, got #{text.class}" unless text.is_a?(String)
|
|
23
|
+
|
|
24
|
+
begin
|
|
25
|
+
result = @inner._encode(text)
|
|
26
|
+
rescue => e
|
|
27
|
+
raise TokenizerRuby::Error, "failed to encode text: #{e.message}"
|
|
28
|
+
end
|
|
15
29
|
Encoding.new(
|
|
16
30
|
ids: result[:ids],
|
|
17
31
|
tokens: result[:tokens],
|
|
@@ -21,7 +35,13 @@ module TokenizerRuby
|
|
|
21
35
|
end
|
|
22
36
|
|
|
23
37
|
def decode(ids)
|
|
24
|
-
|
|
38
|
+
raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}" unless ids.is_a?(Array)
|
|
39
|
+
|
|
40
|
+
begin
|
|
41
|
+
@inner._decode(ids)
|
|
42
|
+
rescue => e
|
|
43
|
+
raise TokenizerRuby::Error, "failed to decode ids: #{e.message}"
|
|
44
|
+
end
|
|
25
45
|
end
|
|
26
46
|
|
|
27
47
|
def encode_batch(texts)
|
|
@@ -57,6 +77,8 @@ module TokenizerRuby
|
|
|
57
77
|
end
|
|
58
78
|
|
|
59
79
|
def truncate(text, max_tokens:)
|
|
80
|
+
raise TokenizerRuby::Error, "max_tokens must be positive, got #{max_tokens}" unless max_tokens > 0
|
|
81
|
+
|
|
60
82
|
encoding = encode(text)
|
|
61
83
|
return text if encoding.length <= max_tokens
|
|
62
84
|
|
|
@@ -71,11 +93,5 @@ module TokenizerRuby
|
|
|
71
93
|
def enable_padding(length:, pad_token: "[PAD]")
|
|
72
94
|
@inner._enable_padding(length, pad_token)
|
|
73
95
|
end
|
|
74
|
-
|
|
75
|
-
private
|
|
76
|
-
|
|
77
|
-
def initialize(inner)
|
|
78
|
-
@inner = inner
|
|
79
|
-
end
|
|
80
96
|
end
|
|
81
97
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizer-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Johannes Dwi Cahyo
|
|
@@ -32,6 +32,7 @@ extra_rdoc_files: []
|
|
|
32
32
|
files:
|
|
33
33
|
- CLAUDE.md
|
|
34
34
|
- LICENSE
|
|
35
|
+
- README.md
|
|
35
36
|
- ext/tokenizer_ruby/Cargo.toml
|
|
36
37
|
- ext/tokenizer_ruby/extconf.rb
|
|
37
38
|
- ext/tokenizer_ruby/src/lib.rs
|