tokenizers 0.5.3-x64-mingw-ucrt

Files changed (42)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +107 -0
  3. data/Cargo.lock +898 -0
  4. data/Cargo.toml +6 -0
  5. data/LICENSE-THIRD-PARTY.txt +17427 -0
  6. data/LICENSE.txt +202 -0
  7. data/README.md +105 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/3.3/tokenizers.so +0 -0
  11. data/lib/tokenizers/added_token.rb +7 -0
  12. data/lib/tokenizers/char_bpe_tokenizer.rb +22 -0
  13. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  14. data/lib/tokenizers/decoders/ctc.rb +9 -0
  15. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  16. data/lib/tokenizers/decoders/strip.rb +9 -0
  17. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  18. data/lib/tokenizers/encoding.rb +19 -0
  19. data/lib/tokenizers/from_pretrained.rb +125 -0
  20. data/lib/tokenizers/models/bpe.rb +9 -0
  21. data/lib/tokenizers/models/unigram.rb +9 -0
  22. data/lib/tokenizers/models/word_level.rb +13 -0
  23. data/lib/tokenizers/models/word_piece.rb +9 -0
  24. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  25. data/lib/tokenizers/normalizers/prepend.rb +9 -0
  26. data/lib/tokenizers/normalizers/strip.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  28. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  29. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  30. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  31. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  32. data/lib/tokenizers/processors/byte_level.rb +9 -0
  33. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  34. data/lib/tokenizers/processors/template_processing.rb +9 -0
  35. data/lib/tokenizers/tokenizer.rb +45 -0
  36. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  37. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  38. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  39. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  40. data/lib/tokenizers/version.rb +3 -0
  41. data/lib/tokenizers.rb +62 -0
  42. metadata +85 -0
data/lib/tokenizers/processors/byte_level.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Processors
+    class ByteLevel
+      def self.new(trim_offsets: true)
+        _new(trim_offsets)
+      end
+    end
+  end
+end
data/lib/tokenizers/processors/roberta_processing.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Processors
+    class RobertaProcessing
+      def self.new(sep, cls, trim_offsets: true, add_prefix_space: true)
+        _new(sep, cls, trim_offsets, add_prefix_space)
+      end
+    end
+  end
+end
data/lib/tokenizers/processors/template_processing.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Processors
+    class TemplateProcessing
+      def self.new(single: nil, pair: nil, special_tokens: nil)
+        _new(single, pair, special_tokens)
+      end
+    end
+  end
+end
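These post-processor wrappers mirror the upstream Hugging Face tokenizers API and simply forward their arguments to the native extension. A minimal usage sketch, assuming the template syntax ($A, $B, :1) and the [token, id] pair format from the upstream library, and a `post_processor=` setter that is provided by the native extension rather than by this diff:

# Hypothetical sketch; argument formats follow the upstream convention
post_processor = Tokenizers::Processors::TemplateProcessing.new(
  single: "[CLS] $A [SEP]",
  pair: "[CLS] $A [SEP] $B:1 [SEP]:1",
  special_tokens: [["[CLS]", 1], ["[SEP]", 2]]  # assumed [token, id] pairs
)

# ByteLevel and RobertaProcessing take the arguments defined above
roberta = Tokenizers::Processors::RobertaProcessing.new(["</s>", 2], ["<s>", 0])
byte_level = Tokenizers::Processors::ByteLevel.new(trim_offsets: false)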
data/lib/tokenizers/tokenizer.rb ADDED
@@ -0,0 +1,45 @@
+module Tokenizers
+  class Tokenizer
+    extend FromPretrained
+
+    def to_s(pretty: false)
+      _to_s(pretty)
+    end
+
+    def save(path, pretty: false)
+      _save(path, pretty)
+    end
+
+    def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
+      _encode(sequence, pair, is_pretokenized, add_special_tokens)
+    end
+
+    def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
+      _encode_batch(input, is_pretokenized, add_special_tokens)
+    end
+
+    def decode(ids, skip_special_tokens: true)
+      _decode(ids, skip_special_tokens)
+    end
+
+    def decode_batch(sequences, skip_special_tokens: true)
+      _decode_batch(sequences, skip_special_tokens)
+    end
+
+    def enable_padding(**options)
+      _enable_padding(options)
+    end
+
+    def enable_truncation(max_length, **options)
+      _enable_truncation(max_length, options)
+    end
+
+    def vocab(with_added_tokens: true)
+      _vocab(with_added_tokens)
+    end
+
+    def vocab_size(with_added_tokens: true)
+      _vocab_size(with_added_tokens)
+    end
+  end
+end
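Each method above is a thin keyword-argument wrapper around an underscore-prefixed native method. A usage sketch, assuming a model loaded via the FromPretrained mixin (from_pretrained.rb) and an Encoding object with `ids`/`tokens` accessors (encoding.rb); the model name and file path are illustrative:

tokenizer = Tokenizers::Tokenizer.from_pretrained("bert-base-uncased")

tokenizer.enable_truncation(128)
tokenizer.enable_padding(length: 128)  # options hash is passed through to _enable_padding

encoded = tokenizer.encode("Hello world", add_special_tokens: true)
encoded.ids                                             # token ids (see encoding.rb)
tokenizer.decode(encoded.ids, skip_special_tokens: true)

batch = tokenizer.encode_batch(["first sequence", "second sequence"])
tokenizer.save("tokenizer.json", pretty: true)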
data/lib/tokenizers/trainers/bpe_trainer.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Trainers
+    class BpeTrainer
+      def self.new(**options)
+        _new(options)
+      end
+    end
+  end
+end
data/lib/tokenizers/trainers/unigram_trainer.rb ADDED
@@ -0,0 +1,26 @@
+module Tokenizers
+  module Trainers
+    class UnigramTrainer
+      def self.new(vocab_size: 8000,
+                   show_progress: true,
+                   special_tokens: [],
+                   initial_alphabet: [],
+                   shrinking_factor: 0.75,
+                   unk_token: nil,
+                   max_piece_length: 16,
+                   n_sub_iterations: 2)
+
+        _new({
+          vocab_size: vocab_size,
+          show_progress: show_progress,
+          special_tokens: special_tokens,
+          initial_alphabet: initial_alphabet,
+          shrinking_factor: shrinking_factor,
+          unk_token: unk_token,
+          max_piece_length: max_piece_length,
+          n_sub_iterations: n_sub_iterations
+        })
+      end
+    end
+  end
+end
data/lib/tokenizers/trainers/word_level_trainer.rb ADDED
@@ -0,0 +1,9 @@
+module Tokenizers
+  module Trainers
+    class WordLevelTrainer
+      def self.new(**options)
+        _new(options)
+      end
+    end
+  end
+end
data/lib/tokenizers/trainers/word_piece_trainer.rb ADDED
@@ -0,0 +1,26 @@
+module Tokenizers
+  module Trainers
+    class WordPieceTrainer
+      def self.new(vocab_size: 30000,
+                   min_frequency: 0,
+                   show_progress: true,
+                   special_tokens: [],
+                   limit_alphabet: nil,
+                   initial_alphabet: [],
+                   continuing_subword_prefix: "##",
+                   end_of_word_suffix: nil)
+
+        _new({
+          vocab_size: vocab_size,
+          min_frequency: min_frequency,
+          show_progress: show_progress,
+          special_tokens: special_tokens,
+          limit_alphabet: limit_alphabet,
+          initial_alphabet: initial_alphabet,
+          continuing_subword_prefix: continuing_subword_prefix,
+          end_of_word_suffix: end_of_word_suffix
+        })
+      end
+    end
+  end
+end
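All four trainers do the same thing: collect keyword arguments into an options hash and hand it to the native `_new`. A hedged sketch of building a trainer and using it, assuming the `Tokenizer.new(model)` constructor and `train` method provided by the native extension (they are not part of this diff); the corpus path is illustrative:

trainer = Tokenizers::Trainers::BpeTrainer.new(
  special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
tokenizer.train(["data/corpus.txt"], trainer)  # assumed native method
tokenizer.save("tokenizer.json")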
data/lib/tokenizers/version.rb ADDED
@@ -0,0 +1,3 @@
+module Tokenizers
+  VERSION = "0.5.3"
+end
data/lib/tokenizers.rb ADDED
@@ -0,0 +1,62 @@
+# ext
+begin
+  require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
+rescue LoadError
+  require "tokenizers/tokenizers"
+end
+
+# decoders
+require_relative "tokenizers/decoders/bpe_decoder"
+require_relative "tokenizers/decoders/ctc"
+require_relative "tokenizers/decoders/metaspace"
+require_relative "tokenizers/decoders/strip"
+require_relative "tokenizers/decoders/word_piece"
+
+# models
+require_relative "tokenizers/models/bpe"
+require_relative "tokenizers/models/word_level"
+require_relative "tokenizers/models/word_piece"
+require_relative "tokenizers/models/unigram"
+
+# normalizers
+require_relative "tokenizers/normalizers/bert_normalizer"
+require_relative "tokenizers/normalizers/prepend"
+require_relative "tokenizers/normalizers/strip"
+
+# pre-tokenizers
+require_relative "tokenizers/pre_tokenizers/byte_level"
+require_relative "tokenizers/pre_tokenizers/digits"
+require_relative "tokenizers/pre_tokenizers/metaspace"
+require_relative "tokenizers/pre_tokenizers/punctuation"
+require_relative "tokenizers/pre_tokenizers/split"
+
+# processors
+require_relative "tokenizers/processors/byte_level"
+require_relative "tokenizers/processors/roberta_processing"
+require_relative "tokenizers/processors/template_processing"
+
+# trainers
+require_relative "tokenizers/trainers/bpe_trainer"
+require_relative "tokenizers/trainers/unigram_trainer"
+require_relative "tokenizers/trainers/word_level_trainer"
+require_relative "tokenizers/trainers/word_piece_trainer"
+
+# other
+require_relative "tokenizers/added_token"
+require_relative "tokenizers/char_bpe_tokenizer"
+require_relative "tokenizers/encoding"
+require_relative "tokenizers/from_pretrained"
+require_relative "tokenizers/tokenizer"
+require_relative "tokenizers/version"
+
+module Tokenizers
+  class Error < StandardError; end
+
+  def self.from_pretrained(...)
+    Tokenizer.from_pretrained(...)
+  end
+
+  def self.from_file(...)
+    Tokenizer.from_file(...)
+  end
+end
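The entry point first tries the Ruby-version-specific prebuilt extension (the 3.1/3.2/3.3 .so files above) and falls back to a generic one, then forwards two convenience constructors to Tokenizer. A sketch of the resulting top-level API; the model name and file path are illustrative:

require "tokenizers"

tokenizer = Tokenizers.from_pretrained("bert-base-cased")   # delegates to Tokenizer.from_pretrained
tokenizer = Tokenizers.from_file("path/to/tokenizer.json")  # delegates to Tokenizer.from_file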
metadata ADDED
@@ -0,0 +1,85 @@
+--- !ruby/object:Gem::Specification
+name: tokenizers
+version: !ruby/object:Gem::Version
+  version: 0.5.3
+platform: x64-mingw-ucrt
+authors:
+- Andrew Kane
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2024-09-17 00:00:00.000000000 Z
+dependencies: []
+description:
+email: andrew@ankane.org
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- CHANGELOG.md
+- Cargo.lock
+- Cargo.toml
+- LICENSE-THIRD-PARTY.txt
+- LICENSE.txt
+- README.md
+- lib/tokenizers.rb
+- lib/tokenizers/3.1/tokenizers.so
+- lib/tokenizers/3.2/tokenizers.so
+- lib/tokenizers/3.3/tokenizers.so
+- lib/tokenizers/added_token.rb
+- lib/tokenizers/char_bpe_tokenizer.rb
+- lib/tokenizers/decoders/bpe_decoder.rb
+- lib/tokenizers/decoders/ctc.rb
+- lib/tokenizers/decoders/metaspace.rb
+- lib/tokenizers/decoders/strip.rb
+- lib/tokenizers/decoders/word_piece.rb
+- lib/tokenizers/encoding.rb
+- lib/tokenizers/from_pretrained.rb
+- lib/tokenizers/models/bpe.rb
+- lib/tokenizers/models/unigram.rb
+- lib/tokenizers/models/word_level.rb
+- lib/tokenizers/models/word_piece.rb
+- lib/tokenizers/normalizers/bert_normalizer.rb
+- lib/tokenizers/normalizers/prepend.rb
+- lib/tokenizers/normalizers/strip.rb
+- lib/tokenizers/pre_tokenizers/byte_level.rb
+- lib/tokenizers/pre_tokenizers/digits.rb
+- lib/tokenizers/pre_tokenizers/metaspace.rb
+- lib/tokenizers/pre_tokenizers/punctuation.rb
+- lib/tokenizers/pre_tokenizers/split.rb
+- lib/tokenizers/processors/byte_level.rb
+- lib/tokenizers/processors/roberta_processing.rb
+- lib/tokenizers/processors/template_processing.rb
+- lib/tokenizers/tokenizer.rb
+- lib/tokenizers/trainers/bpe_trainer.rb
+- lib/tokenizers/trainers/unigram_trainer.rb
+- lib/tokenizers/trainers/word_level_trainer.rb
+- lib/tokenizers/trainers/word_piece_trainer.rb
+- lib/tokenizers/version.rb
+homepage: https://github.com/ankane/tokenizers-ruby
+licenses:
+- Apache-2.0
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '3.1'
+  - - "<"
+    - !ruby/object:Gem::Version
+      version: 3.4.dev
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.4.4
+signing_key:
+specification_version: 4
+summary: Fast state-of-the-art tokenizers for Ruby
+test_files: []
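Per the gemspec above, this precompiled gem targets 64-bit MinGW UCRT builds of Ruby 3.1 through 3.3, with no runtime gem dependencies. A minimal Gemfile entry; Bundler selects the matching platform gem automatically:

# Gemfile
gem "tokenizers", "0.5.3"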