tokenizers 0.5.1-aarch64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +95 -0
  3. data/Cargo.lock +895 -0
  4. data/Cargo.toml +6 -0
  5. data/LICENSE-THIRD-PARTY.txt +17104 -0
  6. data/LICENSE.txt +202 -0
  7. data/README.md +105 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/3.3/tokenizers.so +0 -0
  11. data/lib/tokenizers/char_bpe_tokenizer.rb +22 -0
  12. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  13. data/lib/tokenizers/decoders/ctc.rb +9 -0
  14. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  15. data/lib/tokenizers/decoders/strip.rb +9 -0
  16. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  17. data/lib/tokenizers/encoding.rb +19 -0
  18. data/lib/tokenizers/from_pretrained.rb +125 -0
  19. data/lib/tokenizers/models/bpe.rb +9 -0
  20. data/lib/tokenizers/models/unigram.rb +9 -0
  21. data/lib/tokenizers/models/word_level.rb +13 -0
  22. data/lib/tokenizers/models/word_piece.rb +9 -0
  23. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  24. data/lib/tokenizers/normalizers/prepend.rb +9 -0
  25. data/lib/tokenizers/normalizers/strip.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  28. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  29. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  30. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  31. data/lib/tokenizers/processors/byte_level.rb +9 -0
  32. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  33. data/lib/tokenizers/processors/template_processing.rb +9 -0
  34. data/lib/tokenizers/tokenizer.rb +45 -0
  35. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  36. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  37. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  38. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  39. data/lib/tokenizers/version.rb +3 -0
  40. data/lib/tokenizers.rb +61 -0
  41. metadata +84 -0
data/lib/tokenizers/processors/roberta_processing.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Processors
+     class RobertaProcessing
+       def self.new(sep, cls, trim_offsets: true, add_prefix_space: true)
+         _new(sep, cls, trim_offsets, add_prefix_space)
+       end
+     end
+   end
+ end
data/lib/tokenizers/processors/template_processing.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Processors
+     class TemplateProcessing
+       def self.new(single: nil, pair: nil, special_tokens: nil)
+         _new(single, pair, special_tokens)
+       end
+     end
+   end
+ end
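The keyword arguments are passed straight through to the native _new binding. As a rough sketch of how this class might be constructed (the template strings and token ids below are made up, and the [token, id] pair format for special_tokens is assumed to mirror the upstream Hugging Face tokenizers API rather than confirmed by this diff):

post_processor = Tokenizers::Processors::TemplateProcessing.new(
  single: "[CLS] $A [SEP]",
  pair: "[CLS] $A [SEP] $B:1 [SEP]:1",
  special_tokens: [["[CLS]", 101], ["[SEP]", 102]]  # token/id pairs are assumed, not taken from this diff
)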
data/lib/tokenizers/tokenizer.rb ADDED
@@ -0,0 +1,45 @@
+ module Tokenizers
+   class Tokenizer
+     extend FromPretrained
+
+     def to_s(pretty: false)
+       _to_s(pretty)
+     end
+
+     def save(path, pretty: false)
+       _save(path, pretty)
+     end
+
+     def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
+       _encode(sequence, pair, is_pretokenized, add_special_tokens)
+     end
+
+     def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
+       _encode_batch(input, is_pretokenized, add_special_tokens)
+     end
+
+     def decode(ids, skip_special_tokens: true)
+       _decode(ids, skip_special_tokens)
+     end
+
+     def decode_batch(sequences, skip_special_tokens: true)
+       _decode_batch(sequences, skip_special_tokens)
+     end
+
+     def enable_padding(**options)
+       _enable_padding(options)
+     end
+
+     def enable_truncation(max_length, **options)
+       _enable_truncation(max_length, options)
+     end
+
+     def vocab(with_added_tokens: true)
+       _vocab(with_added_tokens)
+     end
+
+     def vocab_size(with_added_tokens: true)
+       _vocab_size(with_added_tokens)
+     end
+   end
+ end
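Every public method here is a thin keyword-argument wrapper over the native extension (_encode, _decode, _enable_truncation, and so on). A minimal usage sketch, assuming a model fetched via the FromPretrained mixin and an Encoding object exposing tokens/ids readers as in the upstream library (neither is shown in this diff):

tokenizer = Tokenizers::Tokenizer.from_pretrained("bert-base-cased")  # model name is illustrative
encoding = tokenizer.encode("I can feel the magic, can you?")
encoding.tokens                   # assumed reader on Encoding
tokenizer.decode(encoding.ids)    # assumed reader on Encoding
tokenizer.enable_truncation(128)  # extra options forward to _enable_truncation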
data/lib/tokenizers/trainers/bpe_trainer.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Trainers
+     class BpeTrainer
+       def self.new(**options)
+         _new(options)
+       end
+     end
+   end
+ end
data/lib/tokenizers/trainers/unigram_trainer.rb ADDED
@@ -0,0 +1,26 @@
+ module Tokenizers
+   module Trainers
+     class UnigramTrainer
+       def self.new(vocab_size: 8000,
+                    show_progress: true,
+                    special_tokens: [],
+                    initial_alphabet: [],
+                    shrinking_factor: 0.75,
+                    unk_token: nil,
+                    max_piece_length: 16,
+                    n_sub_iterations: 2)
+
+         _new({
+           vocab_size: vocab_size,
+           show_progress: show_progress,
+           special_tokens: special_tokens,
+           initial_alphabet: initial_alphabet,
+           shrinking_factor: shrinking_factor,
+           unk_token: unk_token,
+           max_piece_length: max_piece_length,
+           n_sub_iterations: n_sub_iterations
+         })
+       end
+     end
+   end
+ end
data/lib/tokenizers/trainers/word_level_trainer.rb ADDED
@@ -0,0 +1,9 @@
+ module Tokenizers
+   module Trainers
+     class WordLevelTrainer
+       def self.new(**options)
+         _new(options)
+       end
+     end
+   end
+ end
data/lib/tokenizers/trainers/word_piece_trainer.rb ADDED
@@ -0,0 +1,26 @@
+ module Tokenizers
+   module Trainers
+     class WordPieceTrainer
+       def self.new(vocab_size: 30000,
+                    min_frequency: 0,
+                    show_progress: true,
+                    special_tokens: [],
+                    limit_alphabet: nil,
+                    initial_alphabet: [],
+                    continuing_subword_prefix: "##",
+                    end_of_word_suffix: nil)
+
+         _new({
+           vocab_size: vocab_size,
+           min_frequency: min_frequency,
+           show_progress: show_progress,
+           special_tokens: special_tokens,
+           limit_alphabet: limit_alphabet,
+           initial_alphabet: initial_alphabet,
+           continuing_subword_prefix: continuing_subword_prefix,
+           end_of_word_suffix: end_of_word_suffix
+         })
+       end
+     end
+   end
+ end
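Unlike BpeTrainer and WordLevelTrainer, which forward **options untouched, WordPieceTrainer (like UnigramTrainer above) spells out its defaults in Ruby before handing a single hash to _new. A minimal construction sketch; only the keyword names come from the signature above, the values are illustrative:

trainer = Tokenizers::Trainers::WordPieceTrainer.new(
  vocab_size: 20_000,
  special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]  # example tokens, not defaults
)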
data/lib/tokenizers/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Tokenizers
+   VERSION = "0.5.1"
+ end
data/lib/tokenizers.rb ADDED
@@ -0,0 +1,61 @@
+ # ext
+ begin
+   require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
+ rescue LoadError
+   require "tokenizers/tokenizers"
+ end
+
+ # decoders
+ require_relative "tokenizers/decoders/bpe_decoder"
+ require_relative "tokenizers/decoders/ctc"
+ require_relative "tokenizers/decoders/metaspace"
+ require_relative "tokenizers/decoders/strip"
+ require_relative "tokenizers/decoders/word_piece"
+
+ # models
+ require_relative "tokenizers/models/bpe"
+ require_relative "tokenizers/models/word_level"
+ require_relative "tokenizers/models/word_piece"
+ require_relative "tokenizers/models/unigram"
+
+ # normalizers
+ require_relative "tokenizers/normalizers/bert_normalizer"
+ require_relative "tokenizers/normalizers/prepend"
+ require_relative "tokenizers/normalizers/strip"
+
+ # pre-tokenizers
+ require_relative "tokenizers/pre_tokenizers/byte_level"
+ require_relative "tokenizers/pre_tokenizers/digits"
+ require_relative "tokenizers/pre_tokenizers/metaspace"
+ require_relative "tokenizers/pre_tokenizers/punctuation"
+ require_relative "tokenizers/pre_tokenizers/split"
+
+ # processors
+ require_relative "tokenizers/processors/byte_level"
+ require_relative "tokenizers/processors/roberta_processing"
+ require_relative "tokenizers/processors/template_processing"
+
+ # trainers
+ require_relative "tokenizers/trainers/bpe_trainer"
+ require_relative "tokenizers/trainers/unigram_trainer"
+ require_relative "tokenizers/trainers/word_level_trainer"
+ require_relative "tokenizers/trainers/word_piece_trainer"
+
+ # other
+ require_relative "tokenizers/char_bpe_tokenizer"
+ require_relative "tokenizers/encoding"
+ require_relative "tokenizers/from_pretrained"
+ require_relative "tokenizers/tokenizer"
+ require_relative "tokenizers/version"
+
+ module Tokenizers
+   class Error < StandardError; end
+
+   def self.from_pretrained(...)
+     Tokenizer.from_pretrained(...)
+   end
+
+   def self.from_file(...)
+     Tokenizer.from_file(...)
+   end
+ end
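The module-level helpers simply delegate to Tokenizer via argument forwarding (...), so both entry points are reachable from the top-level namespace. A brief sketch (the file path and model name are placeholders):

require "tokenizers"

# Load a serialized tokenizer.json produced by the Hugging Face tokenizers library
tokenizer = Tokenizers.from_file("path/to/tokenizer.json")

# Or fetch a pretrained tokenizer by name via the FromPretrained mixin
tokenizer = Tokenizers.from_pretrained("bert-base-cased")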
metadata ADDED
@@ -0,0 +1,84 @@
+ --- !ruby/object:Gem::Specification
+ name: tokenizers
+ version: !ruby/object:Gem::Version
+   version: 0.5.1
+ platform: aarch64-linux-musl
+ authors:
+ - Andrew Kane
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2024-08-13 00:00:00.000000000 Z
+ dependencies: []
+ description:
+ email: andrew@ankane.org
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - CHANGELOG.md
+ - Cargo.lock
+ - Cargo.toml
+ - LICENSE-THIRD-PARTY.txt
+ - LICENSE.txt
+ - README.md
+ - lib/tokenizers.rb
+ - lib/tokenizers/3.1/tokenizers.so
+ - lib/tokenizers/3.2/tokenizers.so
+ - lib/tokenizers/3.3/tokenizers.so
+ - lib/tokenizers/char_bpe_tokenizer.rb
+ - lib/tokenizers/decoders/bpe_decoder.rb
+ - lib/tokenizers/decoders/ctc.rb
+ - lib/tokenizers/decoders/metaspace.rb
+ - lib/tokenizers/decoders/strip.rb
+ - lib/tokenizers/decoders/word_piece.rb
+ - lib/tokenizers/encoding.rb
+ - lib/tokenizers/from_pretrained.rb
+ - lib/tokenizers/models/bpe.rb
+ - lib/tokenizers/models/unigram.rb
+ - lib/tokenizers/models/word_level.rb
+ - lib/tokenizers/models/word_piece.rb
+ - lib/tokenizers/normalizers/bert_normalizer.rb
+ - lib/tokenizers/normalizers/prepend.rb
+ - lib/tokenizers/normalizers/strip.rb
+ - lib/tokenizers/pre_tokenizers/byte_level.rb
+ - lib/tokenizers/pre_tokenizers/digits.rb
+ - lib/tokenizers/pre_tokenizers/metaspace.rb
+ - lib/tokenizers/pre_tokenizers/punctuation.rb
+ - lib/tokenizers/pre_tokenizers/split.rb
+ - lib/tokenizers/processors/byte_level.rb
+ - lib/tokenizers/processors/roberta_processing.rb
+ - lib/tokenizers/processors/template_processing.rb
+ - lib/tokenizers/tokenizer.rb
+ - lib/tokenizers/trainers/bpe_trainer.rb
+ - lib/tokenizers/trainers/unigram_trainer.rb
+ - lib/tokenizers/trainers/word_level_trainer.rb
+ - lib/tokenizers/trainers/word_piece_trainer.rb
+ - lib/tokenizers/version.rb
+ homepage: https://github.com/ankane/tokenizers-ruby
+ licenses:
+ - Apache-2.0
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '3.1'
+   - - "<"
+     - !ruby/object:Gem::Version
+       version: 3.4.dev
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 3.3.22
+ requirements: []
+ rubygems_version: 3.4.4
+ signing_key:
+ specification_version: 4
+ summary: Fast state-of-the-art tokenizers for Ruby
+ test_files: []
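Because this is a precompiled aarch64-linux-musl gem that bundles tokenizers.so for Ruby 3.1 through 3.3 (per required_ruby_version above), installation on a matching platform needs nothing beyond the usual Gemfile entry:

# Gemfile
gem "tokenizers", "0.5.1"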