tokenizers 0.5.3-x64-mingw-ucrt

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +107 -0
  3. data/Cargo.lock +898 -0
  4. data/Cargo.toml +6 -0
  5. data/LICENSE-THIRD-PARTY.txt +17427 -0
  6. data/LICENSE.txt +202 -0
  7. data/README.md +105 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/3.3/tokenizers.so +0 -0
  11. data/lib/tokenizers/added_token.rb +7 -0
  12. data/lib/tokenizers/char_bpe_tokenizer.rb +22 -0
  13. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  14. data/lib/tokenizers/decoders/ctc.rb +9 -0
  15. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  16. data/lib/tokenizers/decoders/strip.rb +9 -0
  17. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  18. data/lib/tokenizers/encoding.rb +19 -0
  19. data/lib/tokenizers/from_pretrained.rb +125 -0
  20. data/lib/tokenizers/models/bpe.rb +9 -0
  21. data/lib/tokenizers/models/unigram.rb +9 -0
  22. data/lib/tokenizers/models/word_level.rb +13 -0
  23. data/lib/tokenizers/models/word_piece.rb +9 -0
  24. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  25. data/lib/tokenizers/normalizers/prepend.rb +9 -0
  26. data/lib/tokenizers/normalizers/strip.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  28. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  29. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  30. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  31. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  32. data/lib/tokenizers/processors/byte_level.rb +9 -0
  33. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  34. data/lib/tokenizers/processors/template_processing.rb +9 -0
  35. data/lib/tokenizers/tokenizer.rb +45 -0
  36. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  37. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  38. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  39. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  40. data/lib/tokenizers/version.rb +3 -0
  41. data/lib/tokenizers.rb +62 -0
  42. metadata +85 -0
data/lib/tokenizers/processors/byte_level.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Processors
+     class ByteLevel
+       def self.new(trim_offsets: true)
+         _new(trim_offsets)
+       end
+     end
+   end
+ end
data/lib/tokenizers/processors/roberta_processing.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Processors
+     class RobertaProcessing
+       def self.new(sep, cls, trim_offsets: true, add_prefix_space: true)
+         _new(sep, cls, trim_offsets, add_prefix_space)
+       end
+     end
+   end
+ end
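A brief usage sketch for the wrapper above. It assumes, following the upstream Hugging Face tokenizers API, that sep and cls are token/id pairs passed as two-element arrays; the tokens and ids shown are illustrative and must match the tokenizer's vocabulary.

  require "tokenizers"

  # sep and cls as [token, id] pairs (assumed to mirror the upstream API);
  # the ids 2 and 0 are illustrative placeholders.
  processor = Tokenizers::Processors::RobertaProcessing.new(["</s>", 2], ["<s>", 0])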
data/lib/tokenizers/processors/template_processing.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Processors
+     class TemplateProcessing
+       def self.new(single: nil, pair: nil, special_tokens: nil)
+         _new(single, pair, special_tokens)
+       end
+     end
+   end
+ end
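A hedged sketch of constructing the processor above for a BERT-style setup. The template strings and the special_tokens format (each token paired with its vocabulary id) follow the upstream Hugging Face tokenizers convention; the ids 101 and 102 are illustrative.

  require "tokenizers"

  # $A/$B stand for the first and second sequence; ":1" marks type id 1.
  processor = Tokenizers::Processors::TemplateProcessing.new(
    single: "[CLS] $A [SEP]",
    pair: "[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens: [["[CLS]", 101], ["[SEP]", 102]]  # illustrative ids
  )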
data/lib/tokenizers/tokenizer.rb ADDED
@@ -0,0 +1,45 @@
+ module Tokenizers
+   class Tokenizer
+     extend FromPretrained
+
+     def to_s(pretty: false)
+       _to_s(pretty)
+     end
+
+     def save(path, pretty: false)
+       _save(path, pretty)
+     end
+
+     def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
+       _encode(sequence, pair, is_pretokenized, add_special_tokens)
+     end
+
+     def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
+       _encode_batch(input, is_pretokenized, add_special_tokens)
+     end
+
+     def decode(ids, skip_special_tokens: true)
+       _decode(ids, skip_special_tokens)
+     end
+
+     def decode_batch(sequences, skip_special_tokens: true)
+       _decode_batch(sequences, skip_special_tokens)
+     end
+
+     def enable_padding(**options)
+       _enable_padding(options)
+     end
+
+     def enable_truncation(max_length, **options)
+       _enable_truncation(max_length, options)
+     end
+
+     def vocab(with_added_tokens: true)
+       _vocab(with_added_tokens)
+     end
+
+     def vocab_size(with_added_tokens: true)
+       _vocab_size(with_added_tokens)
+     end
+   end
+ end
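A minimal usage sketch of the instance methods above, loading a tokenizer through the module-level Tokenizers.from_pretrained convenience defined in lib/tokenizers.rb later in this diff. The model id is illustrative; the Encoding#ids accessor is not part of this hunk and is assumed from the gem's encoding.rb, and the padding option key is assumed to mirror the upstream library.

  require "tokenizers"

  tokenizer = Tokenizers.from_pretrained("bert-base-cased")  # illustrative model id

  encoding = tokenizer.encode("Hello, world!")
  ids = encoding.ids                      # assumed Encoding accessor
  text = tokenizer.decode(ids)            # skip_special_tokens defaults to true

  tokenizer.enable_truncation(128)
  tokenizer.enable_padding(length: 128)   # option keys forwarded to the native method
  tokenizer.encode_batch(["first text", "second text"])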
data/lib/tokenizers/trainers/bpe_trainer.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Trainers
+     class BpeTrainer
+       def self.new(**options)
+         _new(options)
+       end
+     end
+   end
+ end
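Unlike the Unigram and WordPiece trainers below, this wrapper forwards arbitrary keyword options straight to the native _new. A sketch, assuming the option keys mirror the upstream BPE trainer:

  require "tokenizers"

  # Keys are passed through unvalidated; these are assumed upstream options.
  trainer = Tokenizers::Trainers::BpeTrainer.new(
    vocab_size: 20_000,
    min_frequency: 2,
    special_tokens: ["<unk>", "<s>", "</s>"]
  )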
data/lib/tokenizers/trainers/unigram_trainer.rb ADDED
@@ -0,0 +1,26 @@
+ module Tokenizers
+   module Trainers
+     class UnigramTrainer
+       def self.new(vocab_size: 8000,
+                    show_progress: true,
+                    special_tokens: [],
+                    initial_alphabet: [],
+                    shrinking_factor: 0.75,
+                    unk_token: nil,
+                    max_piece_length: 16,
+                    n_sub_iterations: 2)
+
+         _new({
+           vocab_size: vocab_size,
+           show_progress: show_progress,
+           special_tokens: special_tokens,
+           initial_alphabet: initial_alphabet,
+           shrinking_factor: shrinking_factor,
+           unk_token: unk_token,
+           max_piece_length: max_piece_length,
+           n_sub_iterations: n_sub_iterations
+         })
+       end
+     end
+   end
+ end
data/lib/tokenizers/trainers/word_level_trainer.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Trainers
+     class WordLevelTrainer
+       def self.new(**options)
+         _new(options)
+       end
+     end
+   end
+ end
data/lib/tokenizers/trainers/word_piece_trainer.rb ADDED
@@ -0,0 +1,26 @@
+ module Tokenizers
+   module Trainers
+     class WordPieceTrainer
+       def self.new(vocab_size: 30000,
+                    min_frequency: 0,
+                    show_progress: true,
+                    special_tokens: [],
+                    limit_alphabet: nil,
+                    initial_alphabet: [],
+                    continuing_subword_prefix: "##",
+                    end_of_word_suffix: nil)
+
+         _new({
+           vocab_size: vocab_size,
+           min_frequency: min_frequency,
+           show_progress: show_progress,
+           special_tokens: special_tokens,
+           limit_alphabet: limit_alphabet,
+           initial_alphabet: initial_alphabet,
+           continuing_subword_prefix: continuing_subword_prefix,
+           end_of_word_suffix: end_of_word_suffix
+         })
+       end
+     end
+   end
+ end
data/lib/tokenizers/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Tokenizers
+   VERSION = "0.5.3"
+ end
data/lib/tokenizers.rb ADDED
@@ -0,0 +1,62 @@
+ # ext
+ begin
+   require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
+ rescue LoadError
+   require "tokenizers/tokenizers"
+ end
+
+ # decoders
+ require_relative "tokenizers/decoders/bpe_decoder"
+ require_relative "tokenizers/decoders/ctc"
+ require_relative "tokenizers/decoders/metaspace"
+ require_relative "tokenizers/decoders/strip"
+ require_relative "tokenizers/decoders/word_piece"
+
+ # models
+ require_relative "tokenizers/models/bpe"
+ require_relative "tokenizers/models/word_level"
+ require_relative "tokenizers/models/word_piece"
+ require_relative "tokenizers/models/unigram"
+
+ # normalizers
+ require_relative "tokenizers/normalizers/bert_normalizer"
+ require_relative "tokenizers/normalizers/prepend"
+ require_relative "tokenizers/normalizers/strip"
+
+ # pre-tokenizers
+ require_relative "tokenizers/pre_tokenizers/byte_level"
+ require_relative "tokenizers/pre_tokenizers/digits"
+ require_relative "tokenizers/pre_tokenizers/metaspace"
+ require_relative "tokenizers/pre_tokenizers/punctuation"
+ require_relative "tokenizers/pre_tokenizers/split"
+
+ # processors
+ require_relative "tokenizers/processors/byte_level"
+ require_relative "tokenizers/processors/roberta_processing"
+ require_relative "tokenizers/processors/template_processing"
+
+ # trainers
+ require_relative "tokenizers/trainers/bpe_trainer"
+ require_relative "tokenizers/trainers/unigram_trainer"
+ require_relative "tokenizers/trainers/word_level_trainer"
+ require_relative "tokenizers/trainers/word_piece_trainer"
+
+ # other
+ require_relative "tokenizers/added_token"
+ require_relative "tokenizers/char_bpe_tokenizer"
+ require_relative "tokenizers/encoding"
+ require_relative "tokenizers/from_pretrained"
+ require_relative "tokenizers/tokenizer"
+ require_relative "tokenizers/version"
+
+ module Tokenizers
+   class Error < StandardError; end
+
+   def self.from_pretrained(...)
+     Tokenizer.from_pretrained(...)
+   end
+
+   def self.from_file(...)
+     Tokenizer.from_file(...)
+   end
+ end
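The begin/rescue at the top loads the prebuilt extension matching the running Ruby (the 3.1/3.2/3.3 tokenizers.so files bundled in this gem) and falls back to a generically named extension, while the module-level helpers simply delegate to Tokenizer. A short sketch of the two entry points; the model id and file path are illustrative:

  require "tokenizers"

  remote = Tokenizers.from_pretrained("bert-base-cased")  # delegates to Tokenizer.from_pretrained
  local  = Tokenizers.from_file("tokenizer.json")         # delegates to Tokenizer.from_file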
metadata ADDED
@@ -0,0 +1,85 @@
+ --- !ruby/object:Gem::Specification
+ name: tokenizers
+ version: !ruby/object:Gem::Version
+   version: 0.5.3
+ platform: x64-mingw-ucrt
+ authors:
+ - Andrew Kane
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2024-09-17 00:00:00.000000000 Z
+ dependencies: []
+ description:
+ email: andrew@ankane.org
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - CHANGELOG.md
+ - Cargo.lock
+ - Cargo.toml
+ - LICENSE-THIRD-PARTY.txt
+ - LICENSE.txt
+ - README.md
+ - lib/tokenizers.rb
+ - lib/tokenizers/3.1/tokenizers.so
+ - lib/tokenizers/3.2/tokenizers.so
+ - lib/tokenizers/3.3/tokenizers.so
+ - lib/tokenizers/added_token.rb
+ - lib/tokenizers/char_bpe_tokenizer.rb
+ - lib/tokenizers/decoders/bpe_decoder.rb
+ - lib/tokenizers/decoders/ctc.rb
+ - lib/tokenizers/decoders/metaspace.rb
+ - lib/tokenizers/decoders/strip.rb
+ - lib/tokenizers/decoders/word_piece.rb
+ - lib/tokenizers/encoding.rb
+ - lib/tokenizers/from_pretrained.rb
+ - lib/tokenizers/models/bpe.rb
+ - lib/tokenizers/models/unigram.rb
+ - lib/tokenizers/models/word_level.rb
+ - lib/tokenizers/models/word_piece.rb
+ - lib/tokenizers/normalizers/bert_normalizer.rb
+ - lib/tokenizers/normalizers/prepend.rb
+ - lib/tokenizers/normalizers/strip.rb
+ - lib/tokenizers/pre_tokenizers/byte_level.rb
+ - lib/tokenizers/pre_tokenizers/digits.rb
+ - lib/tokenizers/pre_tokenizers/metaspace.rb
+ - lib/tokenizers/pre_tokenizers/punctuation.rb
+ - lib/tokenizers/pre_tokenizers/split.rb
+ - lib/tokenizers/processors/byte_level.rb
+ - lib/tokenizers/processors/roberta_processing.rb
+ - lib/tokenizers/processors/template_processing.rb
+ - lib/tokenizers/tokenizer.rb
+ - lib/tokenizers/trainers/bpe_trainer.rb
+ - lib/tokenizers/trainers/unigram_trainer.rb
+ - lib/tokenizers/trainers/word_level_trainer.rb
+ - lib/tokenizers/trainers/word_piece_trainer.rb
+ - lib/tokenizers/version.rb
+ homepage: https://github.com/ankane/tokenizers-ruby
+ licenses:
+ - Apache-2.0
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '3.1'
+   - - "<"
+     - !ruby/object:Gem::Version
+       version: 3.4.dev
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.4.4
+ signing_key:
+ specification_version: 4
+ summary: Fast state-of-the-art tokenizers for Ruby
+ test_files: []
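Given the platform and required_ruby_version above, this build resolves only on Windows (x64-mingw-ucrt) under Ruby 3.1 through 3.3, where Bundler picks the bundled binary extension instead of compiling the Rust crate. A minimal Gemfile entry (the version pin is illustrative):

  # Gemfile
  source "https://rubygems.org"

  gem "tokenizers", "0.5.3"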