tokenizers 0.3.2-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +56 -0
  3. data/Cargo.lock +873 -0
  4. data/Cargo.toml +5 -0
  5. data/LICENSE-THIRD-PARTY.txt +17286 -0
  6. data/LICENSE.txt +202 -0
  7. data/README.md +69 -0
  8. data/lib/tokenizers/2.7/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.0/tokenizers.so +0 -0
  10. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  11. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  12. data/lib/tokenizers/char_bpe_tokenizer.rb +22 -0
  13. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  14. data/lib/tokenizers/decoders/ctc.rb +9 -0
  15. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  16. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  17. data/lib/tokenizers/encoding.rb +19 -0
  18. data/lib/tokenizers/from_pretrained.rb +119 -0
  19. data/lib/tokenizers/models/bpe.rb +9 -0
  20. data/lib/tokenizers/models/unigram.rb +9 -0
  21. data/lib/tokenizers/models/word_level.rb +13 -0
  22. data/lib/tokenizers/models/word_piece.rb +9 -0
  23. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  24. data/lib/tokenizers/normalizers/strip.rb +9 -0
  25. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  28. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  29. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  30. data/lib/tokenizers/processors/byte_level.rb +9 -0
  31. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  32. data/lib/tokenizers/processors/template_processing.rb +9 -0
  33. data/lib/tokenizers/tokenizer.rb +45 -0
  34. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  35. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  36. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  37. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  38. data/lib/tokenizers/version.rb +3 -0
  39. data/lib/tokenizers.rb +59 -0
  40. metadata +83 -0
@@ -0,0 +1,45 @@
1
module Tokenizers
  # Ruby-facing wrapper around the native (Rust) tokenizer object.
  #
  # Each public method accepts idiomatic Ruby keyword arguments and forwards
  # them to a private native counterpart (`_to_s`, `_save`, `_encode`, ...)
  # implemented in the compiled extension. The keyword layer exists so call
  # sites read clearly (e.g. `encode(text, add_special_tokens: false)`)
  # while the native API keeps simple positional/hash signatures.
  class Tokenizer
    # Adds Tokenizer.from_pretrained / .from_file class-level loaders.
    extend FromPretrained

    # Serialize the tokenizer to a JSON string.
    # Set pretty: true for human-readable (indented) output.
    def to_s(pretty: false)
      _to_s(pretty)
    end

    # Write the tokenizer's JSON serialization to +path+.
    def save(path, pretty: false)
      _save(path, pretty)
    end

    # Encode a single sequence (or sequence pair) into an Encoding.
    # is_pretokenized: input is already split into words.
    # add_special_tokens: include model special tokens (e.g. [CLS]/[SEP]).
    def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
      _encode(sequence, pair, is_pretokenized, add_special_tokens)
    end

    # Encode many inputs at once; returns an array of Encodings.
    def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
      _encode_batch(input, is_pretokenized, add_special_tokens)
    end

    # Convert an array of token ids back into a string.
    def decode(ids, skip_special_tokens: true)
      _decode(ids, skip_special_tokens)
    end

    # Decode many id sequences at once; returns an array of strings.
    def decode_batch(sequences, skip_special_tokens: true)
      _decode_batch(sequences, skip_special_tokens)
    end

    # Enable padding; options are forwarded as a hash to the native side
    # (e.g. direction:, pad_id:, pad_token:, length:).
    def enable_padding(**options)
      _enable_padding(options)
    end

    # Enable truncation to +max_length+; extra options (stride:, strategy:,
    # direction:) are forwarded as a hash.
    def enable_truncation(max_length, **options)
      _enable_truncation(max_length, options)
    end

    # Token => id mapping. with_added_tokens: include tokens added after
    # training.
    def vocab(with_added_tokens: true)
      _vocab(with_added_tokens)
    end

    # Number of entries in the vocabulary.
    def vocab_size(with_added_tokens: true)
      _vocab_size(with_added_tokens)
    end
  end
end
@@ -0,0 +1,9 @@
1
module Tokenizers
  module Trainers
    # Trainer for BPE models. All keyword options are collected and handed
    # to the native constructor as a single hash; the native side applies
    # defaults and validation.
    class BpeTrainer
      def self.new(**options)
        _new(options)
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Tokenizers
  module Trainers
    # Trainer for Unigram models. Unlike the hash-forwarding trainers,
    # this one spells out every supported keyword (with its default) so
    # unknown options raise ArgumentError in Ruby rather than reaching
    # the native side.
    class UnigramTrainer
      def self.new(vocab_size: 8000,
                   show_progress: true,
                   special_tokens: [],
                   initial_alphabet: [],
                   shrinking_factor: 0.75,
                   unk_token: nil,
                   max_piece_length: 16,
                   n_sub_iterations: 2)

        # Collect the validated keywords into the hash shape the native
        # constructor expects.
        options = {
          vocab_size: vocab_size,
          show_progress: show_progress,
          special_tokens: special_tokens,
          initial_alphabet: initial_alphabet,
          shrinking_factor: shrinking_factor,
          unk_token: unk_token,
          max_piece_length: max_piece_length,
          n_sub_iterations: n_sub_iterations
        }
        _new(options)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Tokenizers
  module Trainers
    # Trainer for WordLevel models. Keyword options are forwarded verbatim
    # as a hash to the native constructor, which supplies defaults.
    class WordLevelTrainer
      def self.new(**options)
        _new(options)
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Tokenizers
  module Trainers
    # Trainer for WordPiece models. Every supported keyword is declared
    # explicitly (with its default) so misspelled options fail fast in
    # Ruby instead of being silently dropped by the native side.
    class WordPieceTrainer
      def self.new(vocab_size: 30000,
                   min_frequency: 0,
                   show_progress: true,
                   special_tokens: [],
                   limit_alphabet: nil,
                   initial_alphabet: [],
                   continuing_subword_prefix: "##",
                   end_of_word_suffix: nil)

        # Repackage the validated keywords into the hash the native
        # constructor expects.
        options = {
          vocab_size: vocab_size,
          min_frequency: min_frequency,
          show_progress: show_progress,
          special_tokens: special_tokens,
          limit_alphabet: limit_alphabet,
          initial_alphabet: initial_alphabet,
          continuing_subword_prefix: continuing_subword_prefix,
          end_of_word_suffix: end_of_word_suffix
        }
        _new(options)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Tokenizers
  # Gem version string (matches the released native extension build).
  VERSION = "0.3.2"
end
data/lib/tokenizers.rb ADDED
@@ -0,0 +1,59 @@
1
# Entry point for the tokenizers gem.
#
# Load order matters: the compiled extension must come first, since the
# pure-Ruby files below reopen classes it defines. We first try the
# binary matching the running Ruby's minor version (shipped in
# lib/tokenizers/<version>/) and fall back to a single unversioned build.
begin
  require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
rescue LoadError
  require_relative "tokenizers/tokenizers"
end

# decoders
require_relative "tokenizers/decoders/bpe_decoder"
require_relative "tokenizers/decoders/ctc"
require_relative "tokenizers/decoders/metaspace"
require_relative "tokenizers/decoders/word_piece"

# models
require_relative "tokenizers/models/bpe"
require_relative "tokenizers/models/word_level"
require_relative "tokenizers/models/word_piece"
require_relative "tokenizers/models/unigram"

# normalizers
require_relative "tokenizers/normalizers/bert_normalizer"
require_relative "tokenizers/normalizers/strip"

# pre-tokenizers
require_relative "tokenizers/pre_tokenizers/byte_level"
require_relative "tokenizers/pre_tokenizers/digits"
require_relative "tokenizers/pre_tokenizers/metaspace"
require_relative "tokenizers/pre_tokenizers/punctuation"
require_relative "tokenizers/pre_tokenizers/split"

# processors
require_relative "tokenizers/processors/byte_level"
require_relative "tokenizers/processors/roberta_processing"
require_relative "tokenizers/processors/template_processing"

# trainers
require_relative "tokenizers/trainers/bpe_trainer"
require_relative "tokenizers/trainers/unigram_trainer"
require_relative "tokenizers/trainers/word_level_trainer"
require_relative "tokenizers/trainers/word_piece_trainer"

# other
require_relative "tokenizers/char_bpe_tokenizer"
require_relative "tokenizers/encoding"
require_relative "tokenizers/from_pretrained"
require_relative "tokenizers/tokenizer"
require_relative "tokenizers/version"

module Tokenizers
  # Base error class for the gem.
  class Error < StandardError; end

  class << self
    # Module-level shortcut for Tokenizer.from_pretrained; forwards all
    # arguments (including keywords and blocks) unchanged.
    def from_pretrained(...)
      Tokenizer.from_pretrained(...)
    end

    # Module-level shortcut for Tokenizer.from_file.
    def from_file(...)
      Tokenizer.from_file(...)
    end
  end
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tokenizers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.2
5
+ platform: x86_64-linux-musl
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-03-07 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: andrew@ankane.org
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - CHANGELOG.md
20
+ - Cargo.lock
21
+ - Cargo.toml
22
+ - LICENSE-THIRD-PARTY.txt
23
+ - LICENSE.txt
24
+ - README.md
25
+ - lib/tokenizers.rb
26
+ - lib/tokenizers/2.7/tokenizers.so
27
+ - lib/tokenizers/3.0/tokenizers.so
28
+ - lib/tokenizers/3.1/tokenizers.so
29
+ - lib/tokenizers/3.2/tokenizers.so
30
+ - lib/tokenizers/char_bpe_tokenizer.rb
31
+ - lib/tokenizers/decoders/bpe_decoder.rb
32
+ - lib/tokenizers/decoders/ctc.rb
33
+ - lib/tokenizers/decoders/metaspace.rb
34
+ - lib/tokenizers/decoders/word_piece.rb
35
+ - lib/tokenizers/encoding.rb
36
+ - lib/tokenizers/from_pretrained.rb
37
+ - lib/tokenizers/models/bpe.rb
38
+ - lib/tokenizers/models/unigram.rb
39
+ - lib/tokenizers/models/word_level.rb
40
+ - lib/tokenizers/models/word_piece.rb
41
+ - lib/tokenizers/normalizers/bert_normalizer.rb
42
+ - lib/tokenizers/normalizers/strip.rb
43
+ - lib/tokenizers/pre_tokenizers/byte_level.rb
44
+ - lib/tokenizers/pre_tokenizers/digits.rb
45
+ - lib/tokenizers/pre_tokenizers/metaspace.rb
46
+ - lib/tokenizers/pre_tokenizers/punctuation.rb
47
+ - lib/tokenizers/pre_tokenizers/split.rb
48
+ - lib/tokenizers/processors/byte_level.rb
49
+ - lib/tokenizers/processors/roberta_processing.rb
50
+ - lib/tokenizers/processors/template_processing.rb
51
+ - lib/tokenizers/tokenizer.rb
52
+ - lib/tokenizers/trainers/bpe_trainer.rb
53
+ - lib/tokenizers/trainers/unigram_trainer.rb
54
+ - lib/tokenizers/trainers/word_level_trainer.rb
55
+ - lib/tokenizers/trainers/word_piece_trainer.rb
56
+ - lib/tokenizers/version.rb
57
+ homepage: https://github.com/ankane/tokenizers-ruby
58
+ licenses:
59
+ - Apache-2.0
60
+ metadata: {}
61
+ post_install_message:
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '2.7'
70
+ - - "<"
71
+ - !ruby/object:Gem::Version
72
+ version: 3.3.dev
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubygems_version: 3.4.4
80
+ signing_key:
81
+ specification_version: 4
82
+ summary: Fast state-of-the-art tokenizers for Ruby
83
+ test_files: []