tokenizers 0.2.2-aarch64-linux → 0.3.0-aarch64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/Cargo.lock +33 -74
  4. data/LICENSE-THIRD-PARTY.txt +41 -685
  5. data/README.md +4 -0
  6. data/lib/tokenizers/2.7/tokenizers.so +0 -0
  7. data/lib/tokenizers/3.0/tokenizers.so +0 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/char_bpe_tokenizer.rb +11 -8
  11. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  12. data/lib/tokenizers/decoders/ctc.rb +9 -0
  13. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  14. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  15. data/lib/tokenizers/encoding.rb +19 -0
  16. data/lib/tokenizers/from_pretrained.rb +1 -1
  17. data/lib/tokenizers/models/bpe.rb +9 -0
  18. data/lib/tokenizers/models/unigram.rb +9 -0
  19. data/lib/tokenizers/models/word_level.rb +13 -0
  20. data/lib/tokenizers/models/word_piece.rb +9 -0
  21. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  22. data/lib/tokenizers/normalizers/strip.rb +9 -0
  23. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  24. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  25. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  28. data/lib/tokenizers/processors/byte_level.rb +9 -0
  29. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  30. data/lib/tokenizers/processors/template_processing.rb +9 -0
  31. data/lib/tokenizers/tokenizer.rb +45 -0
  32. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  33. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  34. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  35. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  36. data/lib/tokenizers/version.rb +1 -1
  37. data/lib/tokenizers.rb +49 -7
  38. metadata +27 -3
data/README.md CHANGED
@@ -40,6 +40,10 @@ Load a tokenizer from files
40
40
  tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
41
41
  ```
42
42
 
43
+ ## Training
44
+
45
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
46
+
43
47
  ## History
44
48
 
45
49
  View the [changelog](https://github.com/ankane/tokenizers-ruby/blob/master/CHANGELOG.md)
Binary file
Binary file
Binary file
Binary file
@@ -1,15 +1,18 @@
1
1
  module Tokenizers
2
2
  class CharBPETokenizer
3
- def initialize(vocab, merges)
4
- @tokenizer = Tokenizer.new(BPE.new(vocab, merges))
5
- @tokenizer.add_special_tokens(["<unk>"])
6
- @tokenizer.normalizer = BertNormalizer.new
7
- @tokenizer.pre_tokenizer = BertPreTokenizer.new
8
- @tokenizer.decoder = BPEDecoder.new
3
+ def initialize(vocab, merges, unk_token: "<unk>", suffix: "</w>")
4
+ @tokenizer =
5
+ Tokenizer.new(
6
+ Models::BPE._from_file(vocab, merges, {unk_token: unk_token, end_of_word_suffix: suffix})
7
+ )
8
+ @tokenizer.add_special_tokens([unk_token])
9
+ @tokenizer.normalizer = Normalizers::BertNormalizer.new
10
+ @tokenizer.pre_tokenizer = PreTokenizers::BertPreTokenizer.new
11
+ @tokenizer.decoder = Decoders::BPEDecoder.new
9
12
  end
10
13
 
11
- def encode(text)
12
- @tokenizer.encode(text)
14
+ def encode(text, **options)
15
+ @tokenizer.encode(text, **options)
13
16
  end
14
17
 
15
18
  def decode(ids)
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class BPEDecoder
4
+ def self.new(suffix: "</w>")
5
+ _new(suffix)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class CTC
4
+ def self.new(pad_token: "<pad>", word_delimiter_token: "|", cleanup: true)
5
+ _new(pad_token, word_delimiter_token, cleanup)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class Metaspace
4
+ def self.new(replacement: "\u2581", add_prefix_space: true)
5
+ _new(replacement, add_prefix_space)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class WordPiece
4
+ def self.new(prefix: '##', cleanup: true)
5
+ _new(prefix, cleanup)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,19 @@
1
+ module Tokenizers
2
+ class Encoding
3
+ def word_to_tokens(word_index, sequence_index = 0)
4
+ _word_to_tokens(word_index, sequence_index)
5
+ end
6
+
7
+ def word_to_chars(word_index, sequence_index = 0)
8
+ _word_to_chars(word_index, sequence_index)
9
+ end
10
+
11
+ def char_to_token(char_pos, sequence_index = 0)
12
+ _char_to_token(char_pos, sequence_index)
13
+ end
14
+
15
+ def char_to_word(char_pos, sequence_index = 0)
16
+ _char_to_word(char_pos, sequence_index)
17
+ end
18
+ end
19
+ end
@@ -57,7 +57,7 @@ module Tokenizers
57
57
 
58
58
  tempfile =
59
59
  begin
60
- URI.open(url, options)
60
+ URI.parse(url).open(options)
61
61
  rescue OpenURI::HTTPError => e
62
62
  if e.message == "304 Not Modified"
63
63
  return resource_path
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Models
3
+ class BPE
4
+ def self.new(vocab: nil, merges: nil, **kwargs)
5
+ _new(vocab, merges, kwargs)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Models
3
+ class Unigram
4
+ def self.new(vocab: nil, unk_id: nil)
5
+ _new(vocab, unk_id)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,13 @@
1
+ module Tokenizers
2
+ module Models
3
+ class WordLevel
4
+ def self.new(vocab: nil, unk_token: nil)
5
+ _new(vocab, unk_token)
6
+ end
7
+
8
+ def self.from_file(vocab, unk_token: nil)
9
+ _from_file(vocab, unk_token)
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Models
3
+ class WordPiece
4
+ def self.new(vocab: nil, **kwargs)
5
+ _new(vocab, kwargs)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class BertNormalizer
4
+ def self.new(clean_text: true, handle_chinese_chars: true, strip_accents: nil, lowercase: true)
5
+ _new(clean_text, handle_chinese_chars, strip_accents, lowercase)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class Strip
4
+ def self.new(left: true, right: true)
5
+ _new(left, right)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class ByteLevel
4
+ def self.new(add_prefix_space: true, use_regex: true)
5
+ _new(add_prefix_space, use_regex)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Digits
4
+ def self.new(individual_digits: false)
5
+ _new(individual_digits)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Metaspace
4
+ def self.new(replacement: "\u2581", add_prefix_space: true)
5
+ _new(replacement, add_prefix_space)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Punctuation
4
+ def self.new(behavior: "isolated")
5
+ _new(behavior)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module PreTokenizers
3
+ class Split
4
+ def self.new(pattern, behavior, invert: false)
5
+ _new(pattern, behavior, invert)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Processors
3
+ class ByteLevel
4
+ def self.new(trim_offsets: true)
5
+ _new(trim_offsets)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Processors
3
+ class RobertaProcessing
4
+ def self.new(sep, cls, trim_offsets: true, add_prefix_space: true)
5
+ _new(sep, cls, trim_offsets, add_prefix_space)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Processors
3
+ class TemplateProcessing
4
+ def self.new(single: nil, pair: nil, special_tokens: nil)
5
+ _new(single, pair, special_tokens)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,45 @@
1
+ module Tokenizers
2
+ class Tokenizer
3
+ extend FromPretrained
4
+
5
+ def to_s(pretty: false)
6
+ _to_s(pretty)
7
+ end
8
+
9
+ def save(path, pretty: false)
10
+ _save(path, pretty)
11
+ end
12
+
13
+ def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
14
+ _encode(sequence, pair, is_pretokenized, add_special_tokens)
15
+ end
16
+
17
+ def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
18
+ _encode_batch(input, is_pretokenized, add_special_tokens)
19
+ end
20
+
21
+ def decode(ids, skip_special_tokens: true)
22
+ _decode(ids, skip_special_tokens)
23
+ end
24
+
25
+ def decode_batch(sequences, skip_special_tokens: true)
26
+ _decode_batch(sequences, skip_special_tokens)
27
+ end
28
+
29
+ def enable_padding(**options)
30
+ _enable_padding(options)
31
+ end
32
+
33
+ def enable_truncation(max_length, **options)
34
+ _enable_truncation(max_length, options)
35
+ end
36
+
37
+ def vocab(with_added_tokens: true)
38
+ _vocab(with_added_tokens)
39
+ end
40
+
41
+ def vocab_size(with_added_tokens: true)
42
+ _vocab_size(with_added_tokens)
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class BpeTrainer
4
+ def self.new(**options)
5
+ _new(options)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,26 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class UnigramTrainer
4
+ def self.new(vocab_size: 8000,
5
+ show_progress: true,
6
+ special_tokens: [],
7
+ initial_alphabet: [],
8
+ shrinking_factor: 0.75,
9
+ unk_token: nil,
10
+ max_piece_length: 16,
11
+ n_sub_iterations: 2)
12
+
13
+ _new({
14
+ vocab_size: vocab_size,
15
+ show_progress: show_progress,
16
+ special_tokens: special_tokens,
17
+ initial_alphabet: initial_alphabet,
18
+ shrinking_factor: shrinking_factor,
19
+ unk_token: unk_token,
20
+ max_piece_length: max_piece_length,
21
+ n_sub_iterations: n_sub_iterations
22
+ })
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class WordLevelTrainer
4
+ def self.new(**options)
5
+ _new(options)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,26 @@
1
+ module Tokenizers
2
+ module Trainers
3
+ class WordPieceTrainer
4
+ def self.new(vocab_size: 30000,
5
+ min_frequency: 0,
6
+ show_progress: true,
7
+ special_tokens: [],
8
+ limit_alphabet: nil,
9
+ initial_alphabet: [],
10
+ continuing_subword_prefix: "##",
11
+ end_of_word_suffix: nil)
12
+
13
+ _new({
14
+ vocab_size: vocab_size,
15
+ min_frequency: min_frequency,
16
+ show_progress: show_progress,
17
+ special_tokens: special_tokens,
18
+ limit_alphabet: limit_alphabet,
19
+ initial_alphabet: initial_alphabet,
20
+ continuing_subword_prefix: continuing_subword_prefix,
21
+ end_of_word_suffix: end_of_word_suffix
22
+ })
23
+ end
24
+ end
25
+ end
26
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -1,17 +1,59 @@
1
1
  # ext
2
2
  begin
3
- require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
3
+ require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
4
4
  rescue LoadError
5
- require "tokenizers/tokenizers"
5
+ require_relative "tokenizers/tokenizers"
6
6
  end
7
7
 
8
- # modules
9
- require "tokenizers/char_bpe_tokenizer"
10
- require "tokenizers/from_pretrained"
11
- require "tokenizers/version"
8
+ # decoders
9
+ require_relative "tokenizers/decoders/bpe_decoder"
10
+ require_relative "tokenizers/decoders/ctc"
11
+ require_relative "tokenizers/decoders/metaspace"
12
+ require_relative "tokenizers/decoders/word_piece"
13
+
14
+ # models
15
+ require_relative "tokenizers/models/bpe"
16
+ require_relative "tokenizers/models/word_level"
17
+ require_relative "tokenizers/models/word_piece"
18
+ require_relative "tokenizers/models/unigram"
19
+
20
+ # normalizers
21
+ require_relative "tokenizers/normalizers/bert_normalizer"
22
+ require_relative "tokenizers/normalizers/strip"
23
+
24
+ # pre-tokenizers
25
+ require_relative "tokenizers/pre_tokenizers/byte_level"
26
+ require_relative "tokenizers/pre_tokenizers/digits"
27
+ require_relative "tokenizers/pre_tokenizers/metaspace"
28
+ require_relative "tokenizers/pre_tokenizers/punctuation"
29
+ require_relative "tokenizers/pre_tokenizers/split"
30
+
31
+ # processors
32
+ require_relative "tokenizers/processors/byte_level"
33
+ require_relative "tokenizers/processors/roberta_processing"
34
+ require_relative "tokenizers/processors/template_processing"
35
+
36
+ # trainers
37
+ require_relative "tokenizers/trainers/bpe_trainer"
38
+ require_relative "tokenizers/trainers/unigram_trainer"
39
+ require_relative "tokenizers/trainers/word_level_trainer"
40
+ require_relative "tokenizers/trainers/word_piece_trainer"
41
+
42
+ # other
43
+ require_relative "tokenizers/char_bpe_tokenizer"
44
+ require_relative "tokenizers/encoding"
45
+ require_relative "tokenizers/from_pretrained"
46
+ require_relative "tokenizers/tokenizer"
47
+ require_relative "tokenizers/version"
12
48
 
13
49
  module Tokenizers
14
50
  class Error < StandardError; end
15
51
 
16
- extend FromPretrained
52
+ def self.from_pretrained(...)
53
+ Tokenizer.from_pretrained(...)
54
+ end
55
+
56
+ def self.from_file(...)
57
+ Tokenizer.from_file(...)
58
+ end
17
59
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-15 00:00:00.000000000 Z
11
+ date: 2023-02-08 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -28,7 +28,31 @@ files:
28
28
  - lib/tokenizers/3.1/tokenizers.so
29
29
  - lib/tokenizers/3.2/tokenizers.so
30
30
  - lib/tokenizers/char_bpe_tokenizer.rb
31
+ - lib/tokenizers/decoders/bpe_decoder.rb
32
+ - lib/tokenizers/decoders/ctc.rb
33
+ - lib/tokenizers/decoders/metaspace.rb
34
+ - lib/tokenizers/decoders/word_piece.rb
35
+ - lib/tokenizers/encoding.rb
31
36
  - lib/tokenizers/from_pretrained.rb
37
+ - lib/tokenizers/models/bpe.rb
38
+ - lib/tokenizers/models/unigram.rb
39
+ - lib/tokenizers/models/word_level.rb
40
+ - lib/tokenizers/models/word_piece.rb
41
+ - lib/tokenizers/normalizers/bert_normalizer.rb
42
+ - lib/tokenizers/normalizers/strip.rb
43
+ - lib/tokenizers/pre_tokenizers/byte_level.rb
44
+ - lib/tokenizers/pre_tokenizers/digits.rb
45
+ - lib/tokenizers/pre_tokenizers/metaspace.rb
46
+ - lib/tokenizers/pre_tokenizers/punctuation.rb
47
+ - lib/tokenizers/pre_tokenizers/split.rb
48
+ - lib/tokenizers/processors/byte_level.rb
49
+ - lib/tokenizers/processors/roberta_processing.rb
50
+ - lib/tokenizers/processors/template_processing.rb
51
+ - lib/tokenizers/tokenizer.rb
52
+ - lib/tokenizers/trainers/bpe_trainer.rb
53
+ - lib/tokenizers/trainers/unigram_trainer.rb
54
+ - lib/tokenizers/trainers/word_level_trainer.rb
55
+ - lib/tokenizers/trainers/word_piece_trainer.rb
32
56
  - lib/tokenizers/version.rb
33
57
  homepage: https://github.com/ankane/tokenizers-ruby
34
58
  licenses:
@@ -52,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
76
  - !ruby/object:Gem::Version
53
77
  version: '0'
54
78
  requirements: []
55
- rubygems_version: 3.4.3
79
+ rubygems_version: 3.4.4
56
80
  signing_key:
57
81
  specification_version: 4
58
82
  summary: Fast state-of-the-art tokenizers for Ruby