tokenizers 0.3.2 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module Models
3
3
  class Unigram
4
- def self.new(vocab: nil, unk_id: nil)
5
- _new(vocab, unk_id)
4
+ def self.new(vocab: nil, unk_id: nil, byte_fallback: nil)
5
+ _new(vocab, unk_id, byte_fallback)
6
6
  end
7
7
  end
8
8
  end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class Prepend
4
+ def self.new(prepend: "▁")
5
+ _new(prepend)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.3.2"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -1,14 +1,15 @@
1
1
  # ext
2
2
  begin
3
- require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
3
+ require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
4
4
  rescue LoadError
5
- require_relative "tokenizers/tokenizers"
5
+ require "tokenizers/tokenizers"
6
6
  end
7
7
 
8
8
  # decoders
9
9
  require_relative "tokenizers/decoders/bpe_decoder"
10
10
  require_relative "tokenizers/decoders/ctc"
11
11
  require_relative "tokenizers/decoders/metaspace"
12
+ require_relative "tokenizers/decoders/strip"
12
13
  require_relative "tokenizers/decoders/word_piece"
13
14
 
14
15
  # models
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
19
20
 
20
21
  # normalizers
21
22
  require_relative "tokenizers/normalizers/bert_normalizer"
23
+ require_relative "tokenizers/normalizers/prepend"
22
24
  require_relative "tokenizers/normalizers/strip"
23
25
 
24
26
  # pre-tokenizers
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-07 00:00:00.000000000 Z
11
+ date: 2023-09-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -56,6 +56,7 @@ files:
56
56
  - lib/tokenizers/decoders/bpe_decoder.rb
57
57
  - lib/tokenizers/decoders/ctc.rb
58
58
  - lib/tokenizers/decoders/metaspace.rb
59
+ - lib/tokenizers/decoders/strip.rb
59
60
  - lib/tokenizers/decoders/word_piece.rb
60
61
  - lib/tokenizers/encoding.rb
61
62
  - lib/tokenizers/from_pretrained.rb
@@ -64,6 +65,7 @@ files:
64
65
  - lib/tokenizers/models/word_level.rb
65
66
  - lib/tokenizers/models/word_piece.rb
66
67
  - lib/tokenizers/normalizers/bert_normalizer.rb
68
+ - lib/tokenizers/normalizers/prepend.rb
67
69
  - lib/tokenizers/normalizers/strip.rb
68
70
  - lib/tokenizers/pre_tokenizers/byte_level.rb
69
71
  - lib/tokenizers/pre_tokenizers/digits.rb
@@ -91,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
91
93
  requirements:
92
94
  - - ">="
93
95
  - !ruby/object:Gem::Version
94
- version: '2.7'
96
+ version: '3'
95
97
  required_rubygems_version: !ruby/object:Gem::Requirement
96
98
  requirements:
97
99
  - - ">="
98
100
  - !ruby/object:Gem::Version
99
101
  version: '0'
100
102
  requirements: []
101
- rubygems_version: 3.4.6
103
+ rubygems_version: 3.4.10
102
104
  signing_key:
103
105
  specification_version: 4
104
106
  summary: Fast state-of-the-art tokenizers for Ruby