tokenizers 0.2.2-x86_64-linux → 0.3.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/Cargo.lock +33 -74
  4. data/LICENSE-THIRD-PARTY.txt +41 -685
  5. data/README.md +4 -0
  6. data/lib/tokenizers/2.7/tokenizers.so +0 -0
  7. data/lib/tokenizers/3.0/tokenizers.so +0 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/char_bpe_tokenizer.rb +11 -8
  11. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  12. data/lib/tokenizers/decoders/ctc.rb +9 -0
  13. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  14. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  15. data/lib/tokenizers/encoding.rb +19 -0
  16. data/lib/tokenizers/from_pretrained.rb +1 -1
  17. data/lib/tokenizers/models/bpe.rb +9 -0
  18. data/lib/tokenizers/models/unigram.rb +9 -0
  19. data/lib/tokenizers/models/word_level.rb +13 -0
  20. data/lib/tokenizers/models/word_piece.rb +9 -0
  21. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  22. data/lib/tokenizers/normalizers/strip.rb +9 -0
  23. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  24. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  25. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  28. data/lib/tokenizers/processors/byte_level.rb +9 -0
  29. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  30. data/lib/tokenizers/processors/template_processing.rb +9 -0
  31. data/lib/tokenizers/tokenizer.rb +45 -0
  32. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  33. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  34. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  35. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  36. data/lib/tokenizers/version.rb +1 -1
  37. data/lib/tokenizers.rb +49 -7
  38. metadata +27 -3
data/README.md CHANGED
@@ -40,6 +40,10 @@ Load a tokenizer from files
 tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
 ```
 
+## Training
+
+Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
+
 ## History
 
 View the [changelog](https://github.com/ankane/tokenizers-ruby/blob/master/CHANGELOG.md)
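The training workflow itself lives in the linked quicktour test; the following is only a minimal sketch of a BPE training run, assuming a local text file and that `Tokenizer#train` (provided by the native extension, not shown in this diff) takes a list of files plus a trainer.

```ruby
require "tokenizers"

# Untrained BPE model; unk_token is forwarded to the native model constructor.
tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::BertPreTokenizer.new

trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]"])

# Assumed signature, mirroring the Python quicktour: train(files, trainer).
tokenizer.train(["data.txt"], trainer)
tokenizer.save("tokenizer.json")
```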
data/lib/tokenizers/2.7/tokenizers.so CHANGED (binary file, no diff shown)
data/lib/tokenizers/3.0/tokenizers.so CHANGED (binary file, no diff shown)
data/lib/tokenizers/3.1/tokenizers.so CHANGED (binary file, no diff shown)
data/lib/tokenizers/3.2/tokenizers.so CHANGED (binary file, no diff shown)
data/lib/tokenizers/char_bpe_tokenizer.rb CHANGED
@@ -1,15 +1,18 @@
 module Tokenizers
   class CharBPETokenizer
-    def initialize(vocab, merges)
-      @tokenizer = Tokenizer.new(BPE.new(vocab, merges))
-      @tokenizer.add_special_tokens(["<unk>"])
-      @tokenizer.normalizer = BertNormalizer.new
-      @tokenizer.pre_tokenizer = BertPreTokenizer.new
-      @tokenizer.decoder = BPEDecoder.new
+    def initialize(vocab, merges, unk_token: "<unk>", suffix: "</w>")
+      @tokenizer =
+        Tokenizer.new(
+          Models::BPE._from_file(vocab, merges, {unk_token: unk_token, end_of_word_suffix: suffix})
+        )
+      @tokenizer.add_special_tokens([unk_token])
+      @tokenizer.normalizer = Normalizers::BertNormalizer.new
+      @tokenizer.pre_tokenizer = PreTokenizers::BertPreTokenizer.new
+      @tokenizer.decoder = Decoders::BPEDecoder.new
     end
 
-    def encode(text)
-      @tokenizer.encode(text)
+    def encode(text, **options)
+      @tokenizer.encode(text, **options)
     end
 
     def decode(ids)
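The constructor and `encode` now accept options; a brief usage sketch (the vocab/merges paths are placeholders, and the `ids` accessor on the returned encoding comes from the native extension rather than this diff):

```ruby
tokenizer = Tokenizers::CharBPETokenizer.new(
  "vocab.json", "merges.txt",
  unk_token: "<unk>",  # token used for unknown pieces
  suffix: "</w>"       # end-of-word suffix expected by the BPE files
)

# Extra keywords are forwarded to Tokenizer#encode.
encoding = tokenizer.encode("Hello, world!", add_special_tokens: true)
text = tokenizer.decode(encoding.ids)
```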
data/lib/tokenizers/decoders/bpe_decoder.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Decoders
+    class BPEDecoder
+      def self.new(suffix: "</w>")
+        _new(suffix)
+      end
+    end
+  end
+end
data/lib/tokenizers/decoders/ctc.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Decoders
+    class CTC
+      def self.new(pad_token: "<pad>", word_delimiter_token: "|", cleanup: true)
+        _new(pad_token, word_delimiter_token, cleanup)
+      end
+    end
+  end
+end
data/lib/tokenizers/decoders/metaspace.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Decoders
+    class Metaspace
+      def self.new(replacement: "\u2581", add_prefix_space: true)
+        _new(replacement, add_prefix_space)
+      end
+    end
+  end
+end
data/lib/tokenizers/decoders/word_piece.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Decoders
+    class WordPiece
+      def self.new(prefix: "##", cleanup: true)
+        _new(prefix, cleanup)
+      end
+    end
+  end
+end
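A decoder can be swapped on an existing tokenizer via the `decoder=` writer used in char_bpe_tokenizer.rb above; a brief sketch:

```ruby
# WordPiece decoding merges "##"-prefixed continuation pieces back into words.
tokenizer.decoder = Tokenizers::Decoders::WordPiece.new(prefix: "##", cleanup: true)

# Metaspace decoding turns the U+2581 marker back into spaces.
tokenizer.decoder = Tokenizers::Decoders::Metaspace.new(add_prefix_space: true)
```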
data/lib/tokenizers/encoding.rb ADDED
@@ -0,0 +1,19 @@
+module Tokenizers
+  class Encoding
+    def word_to_tokens(word_index, sequence_index = 0)
+      _word_to_tokens(word_index, sequence_index)
+    end
+
+    def word_to_chars(word_index, sequence_index = 0)
+      _word_to_chars(word_index, sequence_index)
+    end
+
+    def char_to_token(char_pos, sequence_index = 0)
+      _char_to_token(char_pos, sequence_index)
+    end
+
+    def char_to_word(char_pos, sequence_index = 0)
+      _char_to_word(char_pos, sequence_index)
+    end
+  end
+end
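A sketch of the new offset-mapping helpers on an encoding returned by `Tokenizer#encode`; the exact return shapes (token ranges, character offsets) are not shown in this diff and follow the upstream library:

```ruby
encoding = tokenizer.encode("Hello there, world")

# Map the second word (0-based index 1) to its token span and character span.
encoding.word_to_tokens(1)  # token indices covering that word
encoding.word_to_chars(1)   # character offsets of that word in the input

# Map a character position back to its token and word.
encoding.char_to_token(6)
encoding.char_to_word(6)
```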
data/lib/tokenizers/from_pretrained.rb CHANGED
@@ -57,7 +57,7 @@ module Tokenizers
 
       tempfile =
         begin
-          URI.open(url, options)
+          URI.parse(url).open(options)
        rescue OpenURI::HTTPError => e
          if e.message == "304 Not Modified"
            return resource_path
data/lib/tokenizers/models/bpe.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Models
+    class BPE
+      def self.new(vocab: nil, merges: nil, **kwargs)
+        _new(vocab, merges, kwargs)
+      end
+    end
+  end
+end
data/lib/tokenizers/models/unigram.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Models
+    class Unigram
+      def self.new(vocab: nil, unk_id: nil)
+        _new(vocab, unk_id)
+      end
+    end
+  end
+end
data/lib/tokenizers/models/word_level.rb ADDED
@@ -0,0 +1,13 @@
+module Tokenizers
+  module Models
+    class WordLevel
+      def self.new(vocab: nil, unk_token: nil)
+        _new(vocab, unk_token)
+      end
+
+      def self.from_file(vocab, unk_token: nil)
+        _from_file(vocab, unk_token)
+      end
+    end
+  end
+end
data/lib/tokenizers/models/word_piece.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Models
+    class WordPiece
+      def self.new(vocab: nil, **kwargs)
+        _new(vocab, kwargs)
+      end
+    end
+  end
+end
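A sketch of constructing a model directly and wrapping it in a tokenizer; the `unk_token` value and file path are arbitrary examples, and extra keywords are passed straight through to the native constructor:

```ruby
# Build an (untrained) BPE model and hand it to a Tokenizer.
model = Tokenizers::Models::BPE.new(unk_token: "[UNK]")
tokenizer = Tokenizers::Tokenizer.new(model)

# WordLevel can also be loaded straight from a vocab file (hypothetical path).
word_level = Tokenizers::Models::WordLevel.from_file("vocab.json", unk_token: "[UNK]")
```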
data/lib/tokenizers/normalizers/bert_normalizer.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Normalizers
+    class BertNormalizer
+      def self.new(clean_text: true, handle_chinese_chars: true, strip_accents: nil, lowercase: true)
+        _new(clean_text, handle_chinese_chars, strip_accents, lowercase)
+      end
+    end
+  end
+end
data/lib/tokenizers/normalizers/strip.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Normalizers
+    class Strip
+      def self.new(left: true, right: true)
+        _new(left, right)
+      end
+    end
+  end
+end
data/lib/tokenizers/pre_tokenizers/byte_level.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module PreTokenizers
+    class ByteLevel
+      def self.new(add_prefix_space: true, use_regex: true)
+        _new(add_prefix_space, use_regex)
+      end
+    end
+  end
+end
data/lib/tokenizers/pre_tokenizers/digits.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module PreTokenizers
+    class Digits
+      def self.new(individual_digits: false)
+        _new(individual_digits)
+      end
+    end
+  end
+end
data/lib/tokenizers/pre_tokenizers/metaspace.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module PreTokenizers
+    class Metaspace
+      def self.new(replacement: "\u2581", add_prefix_space: true)
+        _new(replacement, add_prefix_space)
+      end
+    end
+  end
+end
data/lib/tokenizers/pre_tokenizers/punctuation.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module PreTokenizers
+    class Punctuation
+      def self.new(behavior: "isolated")
+        _new(behavior)
+      end
+    end
+  end
+end
data/lib/tokenizers/pre_tokenizers/split.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module PreTokenizers
+    class Split
+      def self.new(pattern, behavior, invert: false)
+        _new(pattern, behavior, invert)
+      end
+    end
+  end
+end
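These wrappers mirror the upstream pre-tokenizers; a sketch of attaching them via the `pre_tokenizer=` writer. Whether `Split` accepts regex patterns is not shown in this diff, so a plain string pattern is used, and the `"removed"` behavior name follows the upstream library:

```ruby
# Emit each digit as its own token.
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Digits.new(individual_digits: true)

# Split on a literal "-" and drop the separator ("isolated" is the Punctuation default above).
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Split.new("-", "removed")
```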
data/lib/tokenizers/processors/byte_level.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Processors
+    class ByteLevel
+      def self.new(trim_offsets: true)
+        _new(trim_offsets)
+      end
+    end
+  end
+end
data/lib/tokenizers/processors/roberta_processing.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Processors
+    class RobertaProcessing
+      def self.new(sep, cls, trim_offsets: true, add_prefix_space: true)
+        _new(sep, cls, trim_offsets, add_prefix_space)
+      end
+    end
+  end
+end
data/lib/tokenizers/processors/template_processing.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Processors
+    class TemplateProcessing
+      def self.new(single: nil, pair: nil, special_tokens: nil)
+        _new(single, pair, special_tokens)
+      end
+    end
+  end
+end
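A hedged sketch of BERT-style post-processing. The `post_processor=` writer, the `"$A"`/`"$B:1"` template syntax, and the `[token, id]` pair format for `special_tokens` all come from the upstream tokenizers library and are assumptions here, not shown in this diff:

```ruby
# Hypothetical template: wrap single inputs and pairs in [CLS]/[SEP].
tokenizer.post_processor = Tokenizers::Processors::TemplateProcessing.new(
  single: "[CLS] $A [SEP]",
  pair: "[CLS] $A [SEP] $B:1 [SEP]:1",
  special_tokens: [["[CLS]", 1], ["[SEP]", 2]]
)
```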
data/lib/tokenizers/tokenizer.rb ADDED
@@ -0,0 +1,45 @@
+module Tokenizers
+  class Tokenizer
+    extend FromPretrained
+
+    def to_s(pretty: false)
+      _to_s(pretty)
+    end
+
+    def save(path, pretty: false)
+      _save(path, pretty)
+    end
+
+    def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
+      _encode(sequence, pair, is_pretokenized, add_special_tokens)
+    end
+
+    def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
+      _encode_batch(input, is_pretokenized, add_special_tokens)
+    end
+
+    def decode(ids, skip_special_tokens: true)
+      _decode(ids, skip_special_tokens)
+    end
+
+    def decode_batch(sequences, skip_special_tokens: true)
+      _decode_batch(sequences, skip_special_tokens)
+    end
+
+    def enable_padding(**options)
+      _enable_padding(options)
+    end
+
+    def enable_truncation(max_length, **options)
+      _enable_truncation(max_length, options)
+    end
+
+    def vocab(with_added_tokens: true)
+      _vocab(with_added_tokens)
+    end
+
+    def vocab_size(with_added_tokens: true)
+      _vocab_size(with_added_tokens)
+    end
+  end
+end
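A short usage sketch for the wrapper methods above. The keyword options given to `enable_padding` are forwarded untouched to the native extension, so the specific option name used below (`length:`) follows the upstream library and is an assumption, as is the `ids` accessor on encodings:

```ruby
tokenizer = Tokenizers::Tokenizer.from_pretrained("bert-base-uncased")

tokenizer.enable_truncation(128)
tokenizer.enable_padding(length: 128)  # option name assumed, forwarded as-is

encodings = tokenizer.encode_batch(["first sentence", "second sentence"])
texts = tokenizer.decode_batch(encodings.map(&:ids), skip_special_tokens: true)

tokenizer.vocab_size(with_added_tokens: false)
```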
data/lib/tokenizers/trainers/bpe_trainer.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Trainers
+    class BpeTrainer
+      def self.new(**options)
+        _new(options)
+      end
+    end
+  end
+end
data/lib/tokenizers/trainers/unigram_trainer.rb ADDED
@@ -0,0 +1,26 @@
+module Tokenizers
+  module Trainers
+    class UnigramTrainer
+      def self.new(vocab_size: 8000,
+                   show_progress: true,
+                   special_tokens: [],
+                   initial_alphabet: [],
+                   shrinking_factor: 0.75,
+                   unk_token: nil,
+                   max_piece_length: 16,
+                   n_sub_iterations: 2)
+
+        _new({
+          vocab_size: vocab_size,
+          show_progress: show_progress,
+          special_tokens: special_tokens,
+          initial_alphabet: initial_alphabet,
+          shrinking_factor: shrinking_factor,
+          unk_token: unk_token,
+          max_piece_length: max_piece_length,
+          n_sub_iterations: n_sub_iterations
+        })
+      end
+    end
+  end
+end
data/lib/tokenizers/trainers/word_level_trainer.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Trainers
+    class WordLevelTrainer
+      def self.new(**options)
+        _new(options)
+      end
+    end
+  end
+end
data/lib/tokenizers/trainers/word_piece_trainer.rb ADDED
@@ -0,0 +1,26 @@
+module Tokenizers
+  module Trainers
+    class WordPieceTrainer
+      def self.new(vocab_size: 30000,
+                   min_frequency: 0,
+                   show_progress: true,
+                   special_tokens: [],
+                   limit_alphabet: nil,
+                   initial_alphabet: [],
+                   continuing_subword_prefix: "##",
+                   end_of_word_suffix: nil)
+
+        _new({
+          vocab_size: vocab_size,
+          min_frequency: min_frequency,
+          show_progress: show_progress,
+          special_tokens: special_tokens,
+          limit_alphabet: limit_alphabet,
+          initial_alphabet: initial_alphabet,
+          continuing_subword_prefix: continuing_subword_prefix,
+          end_of_word_suffix: end_of_word_suffix
+        })
+      end
+    end
+  end
+end
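Constructing trainers only requires the options you want to override, since the wrappers above supply the defaults; the special-token lists below are illustrative:

```ruby
wp_trainer = Tokenizers::Trainers::WordPieceTrainer.new(
  vocab_size: 20_000,
  special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

uni_trainer = Tokenizers::Trainers::UnigramTrainer.new(
  vocab_size: 8000,
  unk_token: "<unk>"
)
```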
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Tokenizers
-  VERSION = "0.2.2"
+  VERSION = "0.3.0"
 end
data/lib/tokenizers.rb CHANGED
@@ -1,17 +1,59 @@
 # ext
 begin
-  require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
+  require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
 rescue LoadError
-  require "tokenizers/tokenizers"
+  require_relative "tokenizers/tokenizers"
 end
 
-# modules
-require "tokenizers/char_bpe_tokenizer"
-require "tokenizers/from_pretrained"
-require "tokenizers/version"
+# decoders
+require_relative "tokenizers/decoders/bpe_decoder"
+require_relative "tokenizers/decoders/ctc"
+require_relative "tokenizers/decoders/metaspace"
+require_relative "tokenizers/decoders/word_piece"
+
+# models
+require_relative "tokenizers/models/bpe"
+require_relative "tokenizers/models/word_level"
+require_relative "tokenizers/models/word_piece"
+require_relative "tokenizers/models/unigram"
+
+# normalizers
+require_relative "tokenizers/normalizers/bert_normalizer"
+require_relative "tokenizers/normalizers/strip"
+
+# pre-tokenizers
+require_relative "tokenizers/pre_tokenizers/byte_level"
+require_relative "tokenizers/pre_tokenizers/digits"
+require_relative "tokenizers/pre_tokenizers/metaspace"
+require_relative "tokenizers/pre_tokenizers/punctuation"
+require_relative "tokenizers/pre_tokenizers/split"
+
+# processors
+require_relative "tokenizers/processors/byte_level"
+require_relative "tokenizers/processors/roberta_processing"
+require_relative "tokenizers/processors/template_processing"
+
+# trainers
+require_relative "tokenizers/trainers/bpe_trainer"
+require_relative "tokenizers/trainers/unigram_trainer"
+require_relative "tokenizers/trainers/word_level_trainer"
+require_relative "tokenizers/trainers/word_piece_trainer"
+
+# other
+require_relative "tokenizers/char_bpe_tokenizer"
+require_relative "tokenizers/encoding"
+require_relative "tokenizers/from_pretrained"
+require_relative "tokenizers/tokenizer"
+require_relative "tokenizers/version"
 
 module Tokenizers
   class Error < StandardError; end
 
-  extend FromPretrained
+  def self.from_pretrained(...)
+    Tokenizer.from_pretrained(...)
+  end
+
+  def self.from_file(...)
+    Tokenizer.from_file(...)
+  end
 end
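The module-level entry points now delegate to `Tokenizers::Tokenizer`, so existing call sites keep working; a brief sketch (the model name and file path are placeholders):

```ruby
# Download and load a pretrained tokenizer from the Hugging Face Hub.
tokenizer = Tokenizers.from_pretrained("bert-base-uncased")

# Or load a serialized tokenizer from disk.
tokenizer = Tokenizers.from_file("tokenizer.json")
```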
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.0
 platform: x86_64-linux
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-01-15 00:00:00.000000000 Z
+date: 2023-02-08 00:00:00.000000000 Z
 dependencies: []
 description:
 email: andrew@ankane.org
@@ -28,7 +28,31 @@ files:
 - lib/tokenizers/3.1/tokenizers.so
 - lib/tokenizers/3.2/tokenizers.so
 - lib/tokenizers/char_bpe_tokenizer.rb
+- lib/tokenizers/decoders/bpe_decoder.rb
+- lib/tokenizers/decoders/ctc.rb
+- lib/tokenizers/decoders/metaspace.rb
+- lib/tokenizers/decoders/word_piece.rb
+- lib/tokenizers/encoding.rb
 - lib/tokenizers/from_pretrained.rb
+- lib/tokenizers/models/bpe.rb
+- lib/tokenizers/models/unigram.rb
+- lib/tokenizers/models/word_level.rb
+- lib/tokenizers/models/word_piece.rb
+- lib/tokenizers/normalizers/bert_normalizer.rb
+- lib/tokenizers/normalizers/strip.rb
+- lib/tokenizers/pre_tokenizers/byte_level.rb
+- lib/tokenizers/pre_tokenizers/digits.rb
+- lib/tokenizers/pre_tokenizers/metaspace.rb
+- lib/tokenizers/pre_tokenizers/punctuation.rb
+- lib/tokenizers/pre_tokenizers/split.rb
+- lib/tokenizers/processors/byte_level.rb
+- lib/tokenizers/processors/roberta_processing.rb
+- lib/tokenizers/processors/template_processing.rb
+- lib/tokenizers/tokenizer.rb
+- lib/tokenizers/trainers/bpe_trainer.rb
+- lib/tokenizers/trainers/unigram_trainer.rb
+- lib/tokenizers/trainers/word_level_trainer.rb
+- lib/tokenizers/trainers/word_piece_trainer.rb
 - lib/tokenizers/version.rb
 homepage: https://github.com/ankane/tokenizers-ruby
 licenses:
@@ -52,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.4.3
+rubygems_version: 3.4.4
 signing_key:
 specification_version: 4
 summary: Fast state-of-the-art tokenizers for Ruby