transformers-rb 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +203 -0
- data/README.md +163 -0
- data/lib/transformers/activations.rb +57 -0
- data/lib/transformers/configuration_utils.rb +285 -0
- data/lib/transformers/convert_slow_tokenizer.rb +90 -0
- data/lib/transformers/data/processors/squad.rb +115 -0
- data/lib/transformers/dynamic_module_utils.rb +25 -0
- data/lib/transformers/feature_extraction_utils.rb +110 -0
- data/lib/transformers/hf_hub/constants.rb +71 -0
- data/lib/transformers/hf_hub/errors.rb +11 -0
- data/lib/transformers/hf_hub/file_download.rb +764 -0
- data/lib/transformers/hf_hub/utils/_errors.rb +94 -0
- data/lib/transformers/hf_hub/utils/_headers.rb +109 -0
- data/lib/transformers/image_processing_base.rb +169 -0
- data/lib/transformers/image_processing_utils.rb +63 -0
- data/lib/transformers/image_transforms.rb +208 -0
- data/lib/transformers/image_utils.rb +165 -0
- data/lib/transformers/modeling_outputs.rb +81 -0
- data/lib/transformers/modeling_utils.rb +888 -0
- data/lib/transformers/models/auto/auto_factory.rb +138 -0
- data/lib/transformers/models/auto/configuration_auto.rb +61 -0
- data/lib/transformers/models/auto/feature_extraction_auto.rb +20 -0
- data/lib/transformers/models/auto/image_processing_auto.rb +104 -0
- data/lib/transformers/models/auto/modeling_auto.rb +80 -0
- data/lib/transformers/models/auto/tokenization_auto.rb +160 -0
- data/lib/transformers/models/bert/configuration_bert.rb +65 -0
- data/lib/transformers/models/bert/modeling_bert.rb +836 -0
- data/lib/transformers/models/bert/tokenization_bert.rb +115 -0
- data/lib/transformers/models/bert/tokenization_bert_fast.rb +52 -0
- data/lib/transformers/models/distilbert/configuration_distilbert.rb +63 -0
- data/lib/transformers/models/distilbert/modeling_distilbert.rb +616 -0
- data/lib/transformers/models/distilbert/tokenization_distilbert.rb +114 -0
- data/lib/transformers/models/distilbert/tokenization_distilbert_fast.rb +71 -0
- data/lib/transformers/models/vit/configuration_vit.rb +60 -0
- data/lib/transformers/models/vit/image_processing_vit.rb +170 -0
- data/lib/transformers/models/vit/modeling_vit.rb +506 -0
- data/lib/transformers/pipelines/_init.rb +348 -0
- data/lib/transformers/pipelines/base.rb +301 -0
- data/lib/transformers/pipelines/feature_extraction.rb +47 -0
- data/lib/transformers/pipelines/image_classification.rb +110 -0
- data/lib/transformers/pipelines/image_feature_extraction.rb +56 -0
- data/lib/transformers/pipelines/pt_utils.rb +53 -0
- data/lib/transformers/pipelines/question_answering.rb +508 -0
- data/lib/transformers/pipelines/text_classification.rb +123 -0
- data/lib/transformers/pipelines/token_classification.rb +282 -0
- data/lib/transformers/ruby_utils.rb +33 -0
- data/lib/transformers/sentence_transformer.rb +37 -0
- data/lib/transformers/tokenization_utils.rb +152 -0
- data/lib/transformers/tokenization_utils_base.rb +937 -0
- data/lib/transformers/tokenization_utils_fast.rb +386 -0
- data/lib/transformers/torch_utils.rb +25 -0
- data/lib/transformers/utils/_init.rb +31 -0
- data/lib/transformers/utils/generic.rb +107 -0
- data/lib/transformers/utils/hub.rb +209 -0
- data/lib/transformers/utils/import_utils.rb +45 -0
- data/lib/transformers/utils/logging.rb +52 -0
- data/lib/transformers/version.rb +3 -0
- data/lib/transformers-rb.rb +1 -0
- data/lib/transformers.rb +100 -0
- data/licenses/LICENSE-huggingface-hub.txt +201 -0
- data/licenses/LICENSE-sentence-transformers.txt +201 -0
- data/licenses/NOTICE-sentence-transformers.txt +5 -0
- metadata +161 -0
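
The manifest covers tokenizers, configs, and models for BERT, DistilBERT, and ViT, plus the pipeline entry points in data/lib/transformers/pipelines/_init.rb. As a quick orientation before the per-file diffs below, here is a minimal usage sketch; it assumes the gem exposes a Transformers.pipeline helper mirroring the Python API and that a default checkpoint can be fetched from the Hugging Face Hub:

require "transformers-rb"

# Build a text-classification pipeline and call it (pipelines respond to #call).
# Task name and default model download are assumptions based on the Python library.
classifier = Transformers.pipeline("text-classification")
classifier.("transformers-rb brings pretrained transformers to Ruby")
# => expected shape: {label: "...", score: 0.99}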
--- /dev/null
+++ data/lib/transformers/models/bert/tokenization_bert.rb
@@ -0,0 +1,115 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

module Transformers
  module Bert
    class BertTokenizer < PreTrainedTokenizer
      class BasicTokenizer
        attr_reader :do_lower_case, :tokenize_chinese_chars, :never_split, :strip_accents, :do_split_on_punc

        def initialize(
          do_lower_case: true,
          never_split: nil,
          tokenize_chinese_chars: true,
          strip_accents: nil,
          do_split_on_punc: true
        )
          if never_split.nil?
            never_split = []
          end
          @do_lower_case = do_lower_case
          @never_split = Set.new(never_split)
          @tokenize_chinese_chars = tokenize_chinese_chars
          @strip_accents = strip_accents
          @do_split_on_punc = do_split_on_punc
        end
      end

      class WordpieceTokenizer
        def initialize(vocab:, unk_token:, max_input_chars_per_word: 100)
          @vocab = vocab
          @unk_token = unk_token
          @max_input_chars_per_word = max_input_chars_per_word
        end
      end

      attr_reader :vocab, :basic_tokenizer

      def initialize(
        vocab_file:,
        do_lower_case: true,
        do_basic_tokenize: true,
        never_split: nil,
        unk_token: "[UNK]",
        sep_token: "[SEP]",
        pad_token: "[PAD]",
        cls_token: "[CLS]",
        mask_token: "[MASK]",
        tokenize_chinese_chars: true,
        strip_accents: nil,
        **kwargs
      )
        if !File.exist?(vocab_file)
          raise ArgumentError,
            "Can't find a vocabulary file at path '#{vocab_file}'. To load the vocabulary from a Google pretrained" +
            " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        end
        @vocab = load_vocab(vocab_file)
        @ids_to_tokens = @vocab.invert
        @do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize
          @basic_tokenizer =
            BasicTokenizer.new(
              do_lower_case: do_lower_case,
              never_split: never_split,
              tokenize_chinese_chars: tokenize_chinese_chars,
              strip_accents: strip_accents
            )
        end

        @wordpiece_tokenizer = WordpieceTokenizer.new(vocab: @vocab, unk_token: unk_token.to_s)

        super(
          do_lower_case: do_lower_case,
          do_basic_tokenize: do_basic_tokenize,
          never_split: never_split,
          unk_token: unk_token,
          sep_token: sep_token,
          pad_token: pad_token,
          cls_token: cls_token,
          mask_token: mask_token,
          tokenize_chinese_chars: tokenize_chinese_chars,
          strip_accents: strip_accents,
          **kwargs
        )
      end

      def _convert_token_to_id(token)
        @vocab.fetch(token, @vocab.fetch(@unk_token))
      end

      private

      def load_vocab(vocab_file)
        vocab = {}
        tokens = File.readlines(vocab_file)
        tokens.each_with_index do |token, index|
          token = token.chomp
          vocab[token] = index
        end
        vocab
      end
    end
  end
end
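
load_vocab assigns each line of vocab.txt its zero-based row index, and _convert_token_to_id falls back to the [UNK] id for unknown tokens (the PreTrainedTokenizer base class is assumed to set @unk_token from the keyword passed to super). A hedged sketch of that vocab behavior; the file contents and the direct BertTokenizer.new call are illustrative, not the documented loading path (from_pretrained is):

# Illustrative tiny vocabulary; real checkpoints ship a full vocab.txt.
File.write("vocab.txt", ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "hello"].join("\n"))

tokenizer = Transformers::Bert::BertTokenizer.new(vocab_file: "vocab.txt")
tokenizer.vocab["hello"]                  # => 5 (row index in the file)
tokenizer._convert_token_to_id("hello")   # => 5
tokenizer._convert_token_to_id("absent")  # => 1, the [UNK] row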
--- /dev/null
+++ data/lib/transformers/models/bert/tokenization_bert_fast.rb
@@ -0,0 +1,52 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

module Transformers
  module Bert
    class BertTokenizerFast < PreTrainedTokenizerFast
      VOCAB_FILES_NAMES = {vocab_file: "vocab.txt", tokenizer_file: "tokenizer.json"}

      self.vocab_files_names = VOCAB_FILES_NAMES
      self.slow_tokenizer_class = BertTokenizer

      def initialize(
        vocab_file: nil,
        tokenizer_file: nil,
        do_lower_case: true,
        unk_token: "[UNK]",
        sep_token: "[SEP]",
        pad_token: "[PAD]",
        cls_token: "[CLS]",
        mask_token: "[MASK]",
        tokenize_chinese_chars: true,
        strip_accents: nil,
        **kwargs
      )
        super(
          vocab_file,
          tokenizer_file: tokenizer_file,
          do_lower_case: do_lower_case,
          unk_token: unk_token,
          sep_token: sep_token,
          pad_token: pad_token,
          cls_token: cls_token,
          mask_token: mask_token,
          tokenize_chinese_chars: tokenize_chinese_chars,
          strip_accents: strip_accents,
          **kwargs
        )
      end
    end
  end
end
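
The fast variant wraps a prebuilt tokenizer.json and registers the slow class as its fallback via slow_tokenizer_class. A sketch of the expected loading path, assuming from_pretrained on PreTrainedTokenizerFast resolves Hub checkpoints and that tokenizers are callable with encodings exposing input_ids, as in the Python library:

tokenizer = Transformers::Bert::BertTokenizerFast.from_pretrained("bert-base-uncased")
encoding = tokenizer.("Hello world")  # assumed callable, mirroring Python's __call__
encoding[:input_ids]                  # => e.g. [101, 7592, 2088, 102] ([CLS] hello world [SEP])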
--- /dev/null
+++ data/lib/transformers/models/distilbert/configuration_distilbert.rb
@@ -0,0 +1,63 @@
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

module Transformers
  module Distilbert
    class DistilBertConfig < PretrainedConfig
      self.model_type = "distilbert"
      self.attribute_map = {
        hidden_size: "dim",
        num_attention_heads: "n_heads",
        num_hidden_layers: "n_layers",
      }

      attr_reader :vocab_size, :max_position_embeddings, :sinusoidal_pos_embds, :n_layers, :n_heads,
        :dim, :hidden_dim, :dropout, :attention_dropout, :activation, :initializer_range, :qa_dropout,
        :seq_classif_dropout, :pad_token_id

      def initialize(
        vocab_size: 30522,
        max_position_embeddings: 512,
        sinusoidal_pos_embds: false,
        n_layers: 6,
        n_heads: 12,
        dim: 768,
        hidden_dim: 4 * 768,
        dropout: 0.1,
        attention_dropout: 0.1,
        activation: "gelu",
        initializer_range: 0.02,
        qa_dropout: 0.1,
        seq_classif_dropout: 0.2,
        pad_token_id: 0,
        **kwargs
      )
        @vocab_size = vocab_size
        @max_position_embeddings = max_position_embeddings
        @sinusoidal_pos_embds = sinusoidal_pos_embds
        @n_layers = n_layers
        @n_heads = n_heads
        @dim = dim
        @hidden_dim = hidden_dim
        @dropout = dropout
        @attention_dropout = attention_dropout
        @activation = activation
        @initializer_range = initializer_range
        @qa_dropout = qa_dropout
        @seq_classif_dropout = seq_classif_dropout
        super(**kwargs, pad_token_id: pad_token_id)
      end
    end
  end
end
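
attribute_map lets callers read the canonical names (hidden_size, num_attention_heads, num_hidden_layers) even though DistilBERT stores dim, n_heads, and n_layers; the aliasing itself is assumed to live in PretrainedConfig, as it does in the Python library. A sketch of the expected defaults and remapping:

config = Transformers::Distilbert::DistilBertConfig.new

config.dim         # => 768
config.n_layers    # => 6
config.hidden_dim  # => 3072 (4 * 768)

# Assumption: PretrainedConfig resolves attribute_map aliases,
# so the canonical names return the DistilBERT-specific values.
config.hidden_size        # => 768 (mapped to dim)
config.num_hidden_layers  # => 6   (mapped to n_layers)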