tokenizers 0.2.3-x86_64-darwin → 0.3.1-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/Cargo.lock +33 -74
  4. data/LICENSE-THIRD-PARTY.txt +214 -858
  5. data/README.md +4 -0
  6. data/lib/tokenizers/2.7/tokenizers.bundle +0 -0
  7. data/lib/tokenizers/3.0/tokenizers.bundle +0 -0
  8. data/lib/tokenizers/3.1/tokenizers.bundle +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.bundle +0 -0
  10. data/lib/tokenizers/char_bpe_tokenizer.rb +9 -6
  11. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  12. data/lib/tokenizers/decoders/ctc.rb +9 -0
  13. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  14. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  15. data/lib/tokenizers/from_pretrained.rb +2 -2
  16. data/lib/tokenizers/models/bpe.rb +9 -0
  17. data/lib/tokenizers/models/unigram.rb +9 -0
  18. data/lib/tokenizers/models/word_level.rb +13 -0
  19. data/lib/tokenizers/models/word_piece.rb +9 -0
  20. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  21. data/lib/tokenizers/normalizers/strip.rb +9 -0
  22. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  23. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  24. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  25. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  27. data/lib/tokenizers/processors/byte_level.rb +9 -0
  28. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  29. data/lib/tokenizers/processors/template_processing.rb +9 -0
  30. data/lib/tokenizers/tokenizer.rb +40 -7
  31. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  32. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  33. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  34. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  35. data/lib/tokenizers/version.rb +1 -1
  36. data/lib/tokenizers.rb +42 -2
  37. metadata +24 -2
data/README.md CHANGED
@@ -40,6 +40,10 @@ Load a tokenizer from files
40
40
  tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
41
41
  ```
42
42
 
43
+ ## Training
44
+
45
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and the equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8).
46
+
43
47
  ## History
44
48
 
45
49
  View the [changelog](https://github.com/ankane/tokenizers-ruby/blob/master/CHANGELOG.md)
Binary file
Binary file
Binary file
Binary file
@@ -1,11 +1,14 @@
1
1
  module Tokenizers
2
2
  class CharBPETokenizer
3
- def initialize(vocab, merges)
4
- @tokenizer = Tokenizer.new(BPE.new(vocab, merges))
5
- @tokenizer.add_special_tokens(["<unk>"])
6
- @tokenizer.normalizer = BertNormalizer.new
7
- @tokenizer.pre_tokenizer = BertPreTokenizer.new
8
- @tokenizer.decoder = BPEDecoder.new
3
+ def initialize(vocab, merges, unk_token: "<unk>", suffix: "</w>")
4
+ @tokenizer =
5
+ Tokenizer.new(
6
+ Models::BPE._from_file(vocab, merges, {unk_token: unk_token, end_of_word_suffix: suffix})
7
+ )
8
+ @tokenizer.add_special_tokens([unk_token])
9
+ @tokenizer.normalizer = Normalizers::BertNormalizer.new
10
+ @tokenizer.pre_tokenizer = PreTokenizers::BertPreTokenizer.new
11
+ @tokenizer.decoder = Decoders::BPEDecoder.new
9
12
  end
10
13
 
11
14
  def encode(text, **options)
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class BPEDecoder
4
+ def self.new(suffix: "</w>")
5
+ _new(suffix)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class CTC
4
+ def self.new(pad_token: "<pad>", word_delimiter_token: "|", cleanup: true)
5
+ _new(pad_token, word_delimiter_token, cleanup)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class Metaspace
4
+ def self.new(replacement: "\u2581", add_prefix_space: true)
5
+ _new(replacement, add_prefix_space)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class WordPiece
4
+ def self.new(prefix: '##', cleanup: true)
5
+ _new(prefix, cleanup)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -44,7 +44,7 @@ module Tokenizers
44
44
  def cached_path(cache_dir, url, options)
45
45
  fsum = Digest::SHA256.hexdigest(url)
46
46
  meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
47
- meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
47
+ meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
48
48
  etag = meta["etag"] if meta
49
49
 
50
50
  if etag
@@ -57,7 +57,7 @@ module Tokenizers
57
57
 
58
58
  tempfile =
59
59
  begin
60
- URI.open(url, options)
60
+ URI.parse(url).open(options)
61
61
  rescue OpenURI::HTTPError => e
62
62
  if e.message == "304 Not Modified"
63
63
  return resource_path
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Models
3
+ class BPE
4
+ def self.new(vocab: nil, merges: nil, **kwargs)
5
+ _new(vocab, merges, kwargs)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Models
3
+ class Unigram
4
+ def self.new(vocab: nil, unk_id: nil)
5
+ _new(vocab, unk_id)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,13 @@
1
+ module Tokenizers
2
+ module Models
3
+ class WordLevel
4
+ def self.new(vocab: nil, unk_token: nil)
5
+ _new(vocab, unk_token)
6
+ end
7
+
8
+ def self.from_file(vocab, unk_token: nil)
9
+ _from_file(vocab, unk_token)
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Models
3
+ class WordPiece
4
+ def self.new(vocab: nil, **kwargs)
5
+ _new(vocab, kwargs)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class BertNormalizer
4
+ def self.new(clean_text: true, handle_chinese_chars: true, strip_accents: nil, lowercase: true)
5
+ _new(clean_text, handle_chinese_chars, strip_accents, lowercase)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class Strip
4
+ def self.new(left: true, right: true)
5
+ _new(left, right)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class ByteLevel
4
+ def self.new(add_prefix_space: true, use_regex: true)
5
+ _new(add_prefix_space, use_regex)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Digits
4
+ def self.new(individual_digits: false)
5
+ _new(individual_digits)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Metaspace
4
+ def self.new(replacement: "\u2581", add_prefix_space: true)
5
+ _new(replacement, add_prefix_space)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Punctuation
4
+ def self.new(behavior: "isolated")
5
+ _new(behavior)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Split
4
+ def self.new(pattern, behavior, invert: false)
5
+ _new(pattern, behavior, invert)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Processors
3
+ class ByteLevel
4
+ def self.new(trim_offsets: true)
5
+ _new(trim_offsets)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Processors
3
+ class RobertaProcessing
4
+ def self.new(sep, cls, trim_offsets: true, add_prefix_space: true)
5
+ _new(sep, cls, trim_offsets, add_prefix_space)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Processors
3
+ class TemplateProcessing
4
+ def self.new(single: nil, pair: nil, special_tokens: nil)
5
+ _new(single, pair, special_tokens)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,12 +1,45 @@
1
1
  module Tokenizers
2
2
  class Tokenizer
3
- # TODO change add_special_tokens default to true in 0.3.0
4
- def encode(sequence, add_special_tokens: nil)
5
- if add_special_tokens.nil?
6
- warn "[tokenizers] add_special_tokens will default to true in 0.3.0. Pass add_special_tokens: true/false to silence this warning."
7
- add_special_tokens = false
8
- end
9
- _encode(sequence, add_special_tokens)
3
+ extend FromPretrained
4
+
5
+ def to_s(pretty: false)
6
+ _to_s(pretty)
7
+ end
8
+
9
+ def save(path, pretty: false)
10
+ _save(path, pretty)
11
+ end
12
+
13
+ def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
14
+ _encode(sequence, pair, is_pretokenized, add_special_tokens)
15
+ end
16
+
17
+ def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
18
+ _encode_batch(input, is_pretokenized, add_special_tokens)
19
+ end
20
+
21
+ def decode(ids, skip_special_tokens: true)
22
+ _decode(ids, skip_special_tokens)
23
+ end
24
+
25
+ def decode_batch(sequences, skip_special_tokens: true)
26
+ _decode_batch(sequences, skip_special_tokens)
27
+ end
28
+
29
+ def enable_padding(**options)
30
+ _enable_padding(options)
31
+ end
32
+
33
+ def enable_truncation(max_length, **options)
34
+ _enable_truncation(max_length, options)
35
+ end
36
+
37
+ def vocab(with_added_tokens: true)
38
+ _vocab(with_added_tokens)
39
+ end
40
+
41
+ def vocab_size(with_added_tokens: true)
42
+ _vocab_size(with_added_tokens)
10
43
  end
11
44
  end
12
45
  end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class BpeTrainer
4
+ def self.new(**options)
5
+ _new(options)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,26 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class UnigramTrainer
4
+ def self.new(vocab_size: 8000,
5
+ show_progress: true,
6
+ special_tokens: [],
7
+ initial_alphabet: [],
8
+ shrinking_factor: 0.75,
9
+ unk_token: nil,
10
+ max_piece_length: 16,
11
+ n_sub_iterations: 2)
12
+
13
+ _new({
14
+ vocab_size: vocab_size,
15
+ show_progress: show_progress,
16
+ special_tokens: special_tokens,
17
+ initial_alphabet: initial_alphabet,
18
+ shrinking_factor: shrinking_factor,
19
+ unk_token: unk_token,
20
+ max_piece_length: max_piece_length,
21
+ n_sub_iterations: n_sub_iterations
22
+ })
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class WordLevelTrainer
4
+ def self.new(**options)
5
+ _new(options)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,26 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class WordPieceTrainer
4
+ def self.new(vocab_size: 30000,
5
+ min_frequency: 0,
6
+ show_progress: true,
7
+ special_tokens: [],
8
+ limit_alphabet: nil,
9
+ initial_alphabet: [],
10
+ continuing_subword_prefix: "##",
11
+ end_of_word_suffix: nil)
12
+
13
+ _new({
14
+ vocab_size: vocab_size,
15
+ min_frequency: min_frequency,
16
+ show_progress: show_progress,
17
+ special_tokens: special_tokens,
18
+ limit_alphabet: limit_alphabet,
19
+ initial_alphabet: initial_alphabet,
20
+ continuing_subword_prefix: continuing_subword_prefix,
21
+ end_of_word_suffix: end_of_word_suffix
22
+ })
23
+ end
24
+ end
25
+ end
26
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.2.3"
2
+ VERSION = "0.3.1"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -5,7 +5,41 @@ rescue LoadError
5
5
  require_relative "tokenizers/tokenizers"
6
6
  end
7
7
 
8
- # modules
8
+ # decoders
9
+ require_relative "tokenizers/decoders/bpe_decoder"
10
+ require_relative "tokenizers/decoders/ctc"
11
+ require_relative "tokenizers/decoders/metaspace"
12
+ require_relative "tokenizers/decoders/word_piece"
13
+
14
+ # models
15
+ require_relative "tokenizers/models/bpe"
16
+ require_relative "tokenizers/models/word_level"
17
+ require_relative "tokenizers/models/word_piece"
18
+ require_relative "tokenizers/models/unigram"
19
+
20
+ # normalizers
21
+ require_relative "tokenizers/normalizers/bert_normalizer"
22
+ require_relative "tokenizers/normalizers/strip"
23
+
24
+ # pre-tokenizers
25
+ require_relative "tokenizers/pre_tokenizers/byte_level"
26
+ require_relative "tokenizers/pre_tokenizers/digits"
27
+ require_relative "tokenizers/pre_tokenizers/metaspace"
28
+ require_relative "tokenizers/pre_tokenizers/punctuation"
29
+ require_relative "tokenizers/pre_tokenizers/split"
30
+
31
+ # processors
32
+ require_relative "tokenizers/processors/byte_level"
33
+ require_relative "tokenizers/processors/roberta_processing"
34
+ require_relative "tokenizers/processors/template_processing"
35
+
36
+ # trainers
37
+ require_relative "tokenizers/trainers/bpe_trainer"
38
+ require_relative "tokenizers/trainers/unigram_trainer"
39
+ require_relative "tokenizers/trainers/word_level_trainer"
40
+ require_relative "tokenizers/trainers/word_piece_trainer"
41
+
42
+ # other
9
43
  require_relative "tokenizers/char_bpe_tokenizer"
10
44
  require_relative "tokenizers/encoding"
11
45
  require_relative "tokenizers/from_pretrained"
@@ -15,5 +49,11 @@ require_relative "tokenizers/version"
15
49
  module Tokenizers
16
50
  class Error < StandardError; end
17
51
 
18
- extend FromPretrained
52
+ def self.from_pretrained(...)
53
+ Tokenizer.from_pretrained(...)
54
+ end
55
+
56
+ def self.from_file(...)
57
+ Tokenizer.from_file(...)
58
+ end
19
59
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.1
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-22 00:00:00.000000000 Z
11
+ date: 2023-02-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -28,9 +28,31 @@ files:
28
28
  - lib/tokenizers/3.1/tokenizers.bundle
29
29
  - lib/tokenizers/3.2/tokenizers.bundle
30
30
  - lib/tokenizers/char_bpe_tokenizer.rb
31
+ - lib/tokenizers/decoders/bpe_decoder.rb
32
+ - lib/tokenizers/decoders/ctc.rb
33
+ - lib/tokenizers/decoders/metaspace.rb
34
+ - lib/tokenizers/decoders/word_piece.rb
31
35
  - lib/tokenizers/encoding.rb
32
36
  - lib/tokenizers/from_pretrained.rb
37
+ - lib/tokenizers/models/bpe.rb
38
+ - lib/tokenizers/models/unigram.rb
39
+ - lib/tokenizers/models/word_level.rb
40
+ - lib/tokenizers/models/word_piece.rb
41
+ - lib/tokenizers/normalizers/bert_normalizer.rb
42
+ - lib/tokenizers/normalizers/strip.rb
43
+ - lib/tokenizers/pre_tokenizers/byte_level.rb
44
+ - lib/tokenizers/pre_tokenizers/digits.rb
45
+ - lib/tokenizers/pre_tokenizers/metaspace.rb
46
+ - lib/tokenizers/pre_tokenizers/punctuation.rb
47
+ - lib/tokenizers/pre_tokenizers/split.rb
48
+ - lib/tokenizers/processors/byte_level.rb
49
+ - lib/tokenizers/processors/roberta_processing.rb
50
+ - lib/tokenizers/processors/template_processing.rb
33
51
  - lib/tokenizers/tokenizer.rb
52
+ - lib/tokenizers/trainers/bpe_trainer.rb
53
+ - lib/tokenizers/trainers/unigram_trainer.rb
54
+ - lib/tokenizers/trainers/word_level_trainer.rb
55
+ - lib/tokenizers/trainers/word_piece_trainer.rb
34
56
  - lib/tokenizers/version.rb
35
57
  homepage: https://github.com/ankane/tokenizers-ruby
36
58
  licenses: