transformers-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +203 -0
  4. data/README.md +163 -0
  5. data/lib/transformers/activations.rb +57 -0
  6. data/lib/transformers/configuration_utils.rb +285 -0
  7. data/lib/transformers/convert_slow_tokenizer.rb +90 -0
  8. data/lib/transformers/data/processors/squad.rb +115 -0
  9. data/lib/transformers/dynamic_module_utils.rb +25 -0
  10. data/lib/transformers/feature_extraction_utils.rb +110 -0
  11. data/lib/transformers/hf_hub/constants.rb +71 -0
  12. data/lib/transformers/hf_hub/errors.rb +11 -0
  13. data/lib/transformers/hf_hub/file_download.rb +764 -0
  14. data/lib/transformers/hf_hub/utils/_errors.rb +94 -0
  15. data/lib/transformers/hf_hub/utils/_headers.rb +109 -0
  16. data/lib/transformers/image_processing_base.rb +169 -0
  17. data/lib/transformers/image_processing_utils.rb +63 -0
  18. data/lib/transformers/image_transforms.rb +208 -0
  19. data/lib/transformers/image_utils.rb +165 -0
  20. data/lib/transformers/modeling_outputs.rb +81 -0
  21. data/lib/transformers/modeling_utils.rb +888 -0
  22. data/lib/transformers/models/auto/auto_factory.rb +138 -0
  23. data/lib/transformers/models/auto/configuration_auto.rb +61 -0
  24. data/lib/transformers/models/auto/feature_extraction_auto.rb +20 -0
  25. data/lib/transformers/models/auto/image_processing_auto.rb +104 -0
  26. data/lib/transformers/models/auto/modeling_auto.rb +80 -0
  27. data/lib/transformers/models/auto/tokenization_auto.rb +160 -0
  28. data/lib/transformers/models/bert/configuration_bert.rb +65 -0
  29. data/lib/transformers/models/bert/modeling_bert.rb +836 -0
  30. data/lib/transformers/models/bert/tokenization_bert.rb +115 -0
  31. data/lib/transformers/models/bert/tokenization_bert_fast.rb +52 -0
  32. data/lib/transformers/models/distilbert/configuration_distilbert.rb +63 -0
  33. data/lib/transformers/models/distilbert/modeling_distilbert.rb +616 -0
  34. data/lib/transformers/models/distilbert/tokenization_distilbert.rb +114 -0
  35. data/lib/transformers/models/distilbert/tokenization_distilbert_fast.rb +71 -0
  36. data/lib/transformers/models/vit/configuration_vit.rb +60 -0
  37. data/lib/transformers/models/vit/image_processing_vit.rb +170 -0
  38. data/lib/transformers/models/vit/modeling_vit.rb +506 -0
  39. data/lib/transformers/pipelines/_init.rb +348 -0
  40. data/lib/transformers/pipelines/base.rb +301 -0
  41. data/lib/transformers/pipelines/feature_extraction.rb +47 -0
  42. data/lib/transformers/pipelines/image_classification.rb +110 -0
  43. data/lib/transformers/pipelines/image_feature_extraction.rb +56 -0
  44. data/lib/transformers/pipelines/pt_utils.rb +53 -0
  45. data/lib/transformers/pipelines/question_answering.rb +508 -0
  46. data/lib/transformers/pipelines/text_classification.rb +123 -0
  47. data/lib/transformers/pipelines/token_classification.rb +282 -0
  48. data/lib/transformers/ruby_utils.rb +33 -0
  49. data/lib/transformers/sentence_transformer.rb +37 -0
  50. data/lib/transformers/tokenization_utils.rb +152 -0
  51. data/lib/transformers/tokenization_utils_base.rb +937 -0
  52. data/lib/transformers/tokenization_utils_fast.rb +386 -0
  53. data/lib/transformers/torch_utils.rb +25 -0
  54. data/lib/transformers/utils/_init.rb +31 -0
  55. data/lib/transformers/utils/generic.rb +107 -0
  56. data/lib/transformers/utils/hub.rb +209 -0
  57. data/lib/transformers/utils/import_utils.rb +45 -0
  58. data/lib/transformers/utils/logging.rb +52 -0
  59. data/lib/transformers/version.rb +3 -0
  60. data/lib/transformers-rb.rb +1 -0
  61. data/lib/transformers.rb +100 -0
  62. data/licenses/LICENSE-huggingface-hub.txt +201 -0
  63. data/licenses/LICENSE-sentence-transformers.txt +201 -0
  64. data/licenses/NOTICE-sentence-transformers.txt +5 -0
  65. metadata +161 -0
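
The listing already outlines the gem's layout: pipeline classes under data/lib/transformers/pipelines, auto classes that select a concrete config/model/tokenizer under data/lib/transformers/models/auto, BERT, DistilBERT, and ViT implementations, and Hugging Face Hub download code vendored under hf_hub. Before the per-file hunks below, a minimal usage sketch, assuming the gem mirrors the Python library's pipeline entry point (pipelines/_init.rb) and that pipelines are invoked as callables; the task name and inputs are illustrative only:

  require "transformers-rb"

  # Assumed behavior: a default model for the task is fetched through the vendored hf_hub code on first use.
  qa = Transformers.pipeline("question-answering")
  qa.(question: "What is the gem written in?", context: "transformers-rb is a Ruby port of the Transformers library.")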
data/lib/transformers/models/bert/tokenization_bert.rb
@@ -0,0 +1,115 @@
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ module Transformers
+   module Bert
+     class BertTokenizer < PreTrainedTokenizer
+       class BasicTokenizer
+         attr_reader :do_lower_case, :tokenize_chinese_chars, :never_split, :strip_accents, :do_split_on_punc
+
+         def initialize(
+           do_lower_case: true,
+           never_split: nil,
+           tokenize_chinese_chars: true,
+           strip_accents: nil,
+           do_split_on_punc: true
+         )
+           if never_split.nil?
+             never_split = []
+           end
+           @do_lower_case = do_lower_case
+           @never_split = Set.new(never_split)
+           @tokenize_chinese_chars = tokenize_chinese_chars
+           @strip_accents = strip_accents
+           @do_split_on_punc = do_split_on_punc
+         end
+       end
+
+       class WordpieceTokenizer
+         def initialize(vocab:, unk_token:, max_input_chars_per_word: 100)
+           @vocab = vocab
+           @unk_token = unk_token
+           @max_input_chars_per_word = max_input_chars_per_word
+         end
+       end
+
+       attr_reader :vocab, :basic_tokenizer
+
+       def initialize(
+         vocab_file:,
+         do_lower_case: true,
+         do_basic_tokenize: true,
+         never_split: nil,
+         unk_token: "[UNK]",
+         sep_token: "[SEP]",
+         pad_token: "[PAD]",
+         cls_token: "[CLS]",
+         mask_token: "[MASK]",
+         tokenize_chinese_chars: true,
+         strip_accents: nil,
+         **kwargs
+       )
+         if !File.exist?(vocab_file)
+           raise ArgumentError,
+             "Can't find a vocabulary file at path '#{vocab_file}'. To load the vocabulary from a Google pretrained" +
+             " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+         end
+         @vocab = load_vocab(vocab_file)
+         @ids_to_tokens = @vocab.invert
+         @do_basic_tokenize = do_basic_tokenize
+         if do_basic_tokenize
+           @basic_tokenizer =
+             BasicTokenizer.new(
+               do_lower_case: do_lower_case,
+               never_split: never_split,
+               tokenize_chinese_chars: tokenize_chinese_chars,
+               strip_accents: strip_accents
+             )
+         end
+
+         @wordpiece_tokenizer = WordpieceTokenizer.new(vocab: @vocab, unk_token: unk_token.to_s)
+
+         super(
+           do_lower_case: do_lower_case,
+           do_basic_tokenize: do_basic_tokenize,
+           never_split: never_split,
+           unk_token: unk_token,
+           sep_token: sep_token,
+           pad_token: pad_token,
+           cls_token: cls_token,
+           mask_token: mask_token,
+           tokenize_chinese_chars: tokenize_chinese_chars,
+           strip_accents: strip_accents,
+           **kwargs
+         )
+       end
+
+       def _convert_token_to_id(token)
+         @vocab.fetch(token, @vocab.fetch(@unk_token))
+       end
+
+       private
+
+       def load_vocab(vocab_file)
+         vocab = {}
+         tokens = File.readlines(vocab_file)
+         tokens.each_with_index do |token, index|
+           token = token.chomp
+           vocab[token] = index
+         end
+         vocab
+       end
+     end
+   end
+ end
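
A minimal sketch of using the slow tokenizer above, assuming direct construction through the PreTrainedTokenizer base class works and that vocab.txt is a local WordPiece vocabulary with one token per line (the [UNK] entry must be present, since _convert_token_to_id falls back to it):

  tokenizer = Transformers::Bert::BertTokenizer.new(vocab_file: "vocab.txt")
  tokenizer.vocab.size                     # entries read by load_vocab; the line index becomes the token id
  tokenizer._convert_token_to_id("hello")  # id from the vocab, or the [UNK] id for unknown tokens

The error message in initialize points to BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME) as the usual way to obtain the vocabulary from a published checkpoint rather than a local file.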
data/lib/transformers/models/bert/tokenization_bert_fast.rb
@@ -0,0 +1,52 @@
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ module Transformers
+   module Bert
+     class BertTokenizerFast < PreTrainedTokenizerFast
+       VOCAB_FILES_NAMES = {vocab_file: "vocab.txt", tokenizer_file: "tokenizer.json"}
+
+       self.vocab_files_names = VOCAB_FILES_NAMES
+       self.slow_tokenizer_class = BertTokenizer
+
+       def initialize(
+         vocab_file: nil,
+         tokenizer_file: nil,
+         do_lower_case: true,
+         unk_token: "[UNK]",
+         sep_token: "[SEP]",
+         pad_token: "[PAD]",
+         cls_token: "[CLS]",
+         mask_token: "[MASK]",
+         tokenize_chinese_chars: true,
+         strip_accents: nil,
+         **kwargs
+       )
+         super(
+           vocab_file,
+           tokenizer_file: tokenizer_file,
+           do_lower_case: do_lower_case,
+           unk_token: unk_token,
+           sep_token: sep_token,
+           pad_token: pad_token,
+           cls_token: cls_token,
+           mask_token: mask_token,
+           tokenize_chinese_chars: tokenize_chinese_chars,
+           strip_accents: strip_accents,
+           **kwargs
+         )
+       end
+     end
+   end
+ end
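
The fast variant delegates everything to PreTrainedTokenizerFast; a sketch, assuming the base class can construct itself from a serialized tokenizer.json alone (vocab_file keeps its nil default):

  fast = Transformers::Bert::BertTokenizerFast.new(tokenizer_file: "tokenizer.json")
  # The class-level declarations above register the expected file names and name
  # BertTokenizer as the slow counterpart, which convert_slow_tokenizer.rb can
  # convert from when only a vocab.txt is available.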
data/lib/transformers/models/distilbert/configuration_distilbert.rb
@@ -0,0 +1,63 @@
+ # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ module Transformers
+   module Distilbert
+     class DistilBertConfig < PretrainedConfig
+       self.model_type = "distilbert"
+       self.attribute_map = {
+         hidden_size: "dim",
+         num_attention_heads: "n_heads",
+         num_hidden_layers: "n_layers",
+       }
+
+       attr_reader :vocab_size, :max_position_embeddings, :sinusoidal_pos_embds, :n_layers, :n_heads,
+         :dim, :hidden_dim, :dropout, :attention_dropout, :activation, :initializer_range, :qa_dropout,
+         :seq_classif_dropout, :pad_token_id
+
+       def initialize(
+         vocab_size: 30522,
+         max_position_embeddings: 512,
+         sinusoidal_pos_embds: false,
+         n_layers: 6,
+         n_heads: 12,
+         dim: 768,
+         hidden_dim: 4 * 768,
+         dropout: 0.1,
+         attention_dropout: 0.1,
+         activation: "gelu",
+         initializer_range: 0.02,
+         qa_dropout: 0.1,
+         seq_classif_dropout: 0.2,
+         pad_token_id: 0,
+         **kwargs
+       )
+         @vocab_size = vocab_size
+         @max_position_embeddings = max_position_embeddings
+         @sinusoidal_pos_embds = sinusoidal_pos_embds
+         @n_layers = n_layers
+         @n_heads = n_heads
+         @dim = dim
+         @hidden_dim = hidden_dim
+         @dropout = dropout
+         @attention_dropout = attention_dropout
+         @activation = activation
+         @initializer_range = initializer_range
+         @qa_dropout = qa_dropout
+         @seq_classif_dropout = seq_classif_dropout
+         super(**kwargs, pad_token_id: pad_token_id)
+       end
+     end
+   end
+ end
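
Finally, a small sketch of the DistilBERT configuration above; the defaults describe the standard distilbert-base shape, and attribute_map exists so that generic attribute names resolve to the DistilBERT-specific ones (resolution assumed to happen in PretrainedConfig):

  config = Transformers::Distilbert::DistilBertConfig.new
  config.n_layers    # => 6
  config.dim         # => 768
  config.hidden_dim  # => 3072 (4 * 768)
  # With attribute_map, a generic lookup such as config.hidden_size is expected
  # to be answered by @dim; that mapping lives in PretrainedConfig, not here.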