SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/araelectra/build_pretraining_dataset.py
@@ -1,230 +0,0 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Writes out text data as tfrecords that ELECTRA can be pre-trained on."""

import argparse
import multiprocessing
import os
import random
import time
import tensorflow as tf

from model import tokenization
from util import utils


def create_int_feature(values):
  feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
  return feature


class ExampleBuilder(object):
  """Given a stream of input text, creates pretraining examples."""

  def __init__(self, tokenizer, max_length):
    self._tokenizer = tokenizer
    self._current_sentences = []
    self._current_length = 0
    self._max_length = max_length
    self._target_length = max_length

  def add_line(self, line):
    """Adds a line of text to the current example being built."""
    line = line.strip().replace("\n", " ")
    if (not line) and self._current_length != 0:  # empty lines separate docs
      return self._create_example()
    bert_tokens = self._tokenizer.tokenize(line)
    bert_tokids = self._tokenizer.convert_tokens_to_ids(bert_tokens)
    self._current_sentences.append(bert_tokids)
    self._current_length += len(bert_tokids)
    if self._current_length >= self._target_length:
      return self._create_example()
    return None

  def _create_example(self):
    """Creates a pre-training example from the current list of sentences."""
    # small chance to only have one segment as in classification tasks
    if random.random() < 0.1:
      first_segment_target_length = 100000
    else:
      # -3 due to not yet having [CLS]/[SEP] tokens in the input text
      first_segment_target_length = (self._target_length - 3) // 2

    first_segment = []
    second_segment = []
    for sentence in self._current_sentences:
      # the sentence goes to the first segment if (1) the first segment is
      # empty, (2) the sentence doesn't put the first segment over length or
      # (3) 50% of the time when it does put the first segment over length
      if (len(first_segment) == 0 or
          len(first_segment) + len(sentence) < first_segment_target_length or
          (len(second_segment) == 0 and
           len(first_segment) < first_segment_target_length and
           random.random() < 0.5)):
        first_segment += sentence
      else:
        second_segment += sentence

    # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens
    first_segment = first_segment[:self._max_length - 2]
    second_segment = second_segment[:max(0, self._max_length -
                                         len(first_segment) - 3)]

    # prepare to start building the next example
    self._current_sentences = []
    self._current_length = 0
    # small chance for random-length instead of max_length-length example
    if random.random() < 0.05:
      self._target_length = random.randint(5, self._max_length)
    else:
      self._target_length = self._max_length

    return self._make_tf_example(first_segment, second_segment)

  def _make_tf_example(self, first_segment, second_segment):
    """Converts two "segments" of text into a tf.train.Example."""
    vocab = self._tokenizer.vocab
    input_ids = [vocab["[CLS]"]] + first_segment + [vocab["[SEP]"]]
    segment_ids = [0] * len(input_ids)
    if second_segment:
      input_ids += second_segment + [vocab["[SEP]"]]
      segment_ids += [1] * (len(second_segment) + 1)
    input_mask = [1] * len(input_ids)
    input_ids += [0] * (self._max_length - len(input_ids))
    input_mask += [0] * (self._max_length - len(input_mask))
    segment_ids += [0] * (self._max_length - len(segment_ids))
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        "input_ids": create_int_feature(input_ids),
        "input_mask": create_int_feature(input_mask),
        "segment_ids": create_int_feature(segment_ids)
    }))
    return tf_example


class ExampleWriter(object):
  """Writes pre-training examples to disk."""

  def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
               num_jobs, blanks_separate_docs, do_lower_case,
               num_out_files=1000):
    self._blanks_separate_docs = blanks_separate_docs
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file,
        do_lower_case=do_lower_case)
    self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
    self._writers = []
    for i in range(num_out_files):
      if i % num_jobs == job_id:
        output_fname = os.path.join(
            output_dir, "pretrain_data.tfrecord-{:}-of-{:}".format(
                i, num_out_files))
        self._writers.append(tf.io.TFRecordWriter(output_fname))
    self.n_written = 0

  def write_examples(self, input_file):
    """Writes out examples from the provided input file."""
    with tf.io.gfile.GFile(input_file) as f:
      for line in f:
        line = line.strip()
        if line or self._blanks_separate_docs:
          example = self._example_builder.add_line(line)
          if example:
            self._writers[self.n_written % len(self._writers)].write(
                example.SerializeToString())
            self.n_written += 1
      example = self._example_builder.add_line("")
      if example:
        self._writers[self.n_written % len(self._writers)].write(
            example.SerializeToString())
        self.n_written += 1

  def finish(self):
    for writer in self._writers:
      writer.close()


def write_examples(job_id, args):
  """A single process creating and writing out pre-processed examples."""

  def log(*args):
    msg = " ".join(map(str, args))
    print("Job {}:".format(job_id), msg)

  log("Creating example writer")
  example_writer = ExampleWriter(
      job_id=job_id,
      vocab_file=args.vocab_file,
      output_dir=args.output_dir,
      max_seq_length=args.max_seq_length,
      num_jobs=args.num_processes,
      blanks_separate_docs=args.blanks_separate_docs,
      do_lower_case=args.do_lower_case
  )
  log("Writing tf examples")
  fnames = sorted(tf.io.gfile.listdir(args.corpus_dir))
  fnames = [f for (i, f) in enumerate(fnames)
            if i % args.num_processes == job_id]
  random.shuffle(fnames)
  start_time = time.time()
  for file_no, fname in enumerate(fnames):
    if file_no > 0:
      elapsed = time.time() - start_time
      log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
          "{:} examples written".format(
              file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed),
              int((len(fnames) - file_no) / (file_no / elapsed)),
              example_writer.n_written))
    example_writer.write_examples(os.path.join(args.corpus_dir, fname))
  example_writer.finish()
  log("Done!")


def main():
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--corpus-dir", required=True,
                      help="Location of pre-training text files.")
  parser.add_argument("--vocab-file", required=True,
                      help="Location of vocabulary file.")
  parser.add_argument("--output-dir", required=True,
                      help="Where to write out the tfrecords.")
  parser.add_argument("--max-seq-length", default=128, type=int,
                      help="Number of tokens per example.")
  parser.add_argument("--num-processes", default=1, type=int,
                      help="Parallelize across multiple processes.")
  parser.add_argument("--blanks-separate-docs", default=True, type=bool,
                      help="Whether blank lines indicate document boundaries.")
  parser.add_argument("--do-lower-case", dest='do_lower_case',
                      action='store_true', help="Lower case input text.")
  parser.add_argument("--no-lower-case", dest='do_lower_case',
                      action='store_false', help="Don't lower case input text.")
  parser.set_defaults(do_lower_case=True)
  args = parser.parse_args()

  utils.rmkdir(args.output_dir)
  if args.num_processes == 1:
    write_examples(0, args)
  else:
    jobs = []
    for i in range(args.num_processes):
      job = multiprocessing.Process(target=write_examples, args=(i, args))
      jobs.append(job)
      job.start()
    for job in jobs:
      job.join()


if __name__ == "__main__":
  main()
sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py
@@ -1,90 +0,0 @@
# coding=utf-8

import argparse
import os
import tensorflow as tf

import build_pretraining_dataset
from model import tokenization

class ExampleWriter(object):
  """Writes pre-training examples to disk."""

  def __init__(self, input_fname, vocab_file, output_dir, max_seq_length,
               blanks_separate_docs, do_lower_case):
    self._blanks_separate_docs = blanks_separate_docs
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file,
        do_lower_case=do_lower_case)
    self._example_builder = build_pretraining_dataset.ExampleBuilder(tokenizer, max_seq_length)
    output_fname = os.path.join(output_dir, "{}.tfrecord".format(input_fname.split("/")[-1]))
    self._writer = tf.io.TFRecordWriter(output_fname)
    self.n_written = 0

  def write_examples(self, input_file):
    """Writes out examples from the provided input file."""
    with tf.io.gfile.GFile(input_file) as f:
      for line in f:
        line = line.strip()
        if line or self._blanks_separate_docs:
          example = self._example_builder.add_line(line)
          if example:
            self._writer.write(example.SerializeToString())
            self.n_written += 1
      example = self._example_builder.add_line("")
      if example:
        self._writer.write(example.SerializeToString())
        self.n_written += 1

  def finish(self):
    self._writer.close()

def write_examples(args):
  """A single process creating and writing out pre-processed examples."""

  def log(*args):
    msg = " ".join(map(str, args))
    print(msg)

  log("Creating example writer")
  example_writer = ExampleWriter(
      input_fname=args.input_file,
      vocab_file=args.vocab_file,
      output_dir=args.output_dir,
      max_seq_length=args.max_seq_length,
      blanks_separate_docs=args.blanks_separate_docs,
      do_lower_case=args.do_lower_case
  )
  log("Writing tf example")

  example_writer.write_examples(args.input_file)
  example_writer.finish()
  log("Done!")
  return


def main():
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--input-file", required=True,
                      help="Location of pre-training text files.")
  parser.add_argument("--vocab-file", required=True,
                      help="Location of vocabulary file.")
  parser.add_argument("--output-dir", required=True,
                      help="Where to write out the tfrecords.")
  parser.add_argument("--max-seq-length", default=128, type=int,
                      help="Number of tokens per example.")
  parser.add_argument("--blanks-separate-docs", default=True, type=bool,
                      help="Whether blank lines indicate document boundaries.")
  parser.add_argument("--do-lower-case", dest='do_lower_case',
                      action='store_true', help="Lower case input text.")
  parser.add_argument("--no-lower-case", dest='do_lower_case',
                      action='store_false', help="Don't lower case input text.")
  parser.set_defaults(do_lower_case=True)
  args = parser.parse_args()

  write_examples(args)


if __name__ == "__main__":
  main()
sinatools/arabert/araelectra/configure_finetuning.py
@@ -1,172 +0,0 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Config controlling hyperparameters for fine-tuning ELECTRA."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import tensorflow as tf


class FinetuningConfig(object):
  """Fine-tuning hyperparameters."""

  def __init__(self, model_name, data_dir, **kwargs):
    # general
    self.model_name = model_name
    self.debug = False  # debug mode for quickly running things
    self.log_examples = False  # print out some train examples for debugging
    self.num_trials = 1  # how many train+eval runs to perform
    self.do_train = True  # train a model
    self.do_eval = True  # evaluate the model
    self.keep_all_models = True  # if False, only keep the last trial's ckpt

    # model
    self.model_size = "base"  # one of "small", "base", or "large"
    self.task_names = ["chunk"]  # which tasks to learn
    # override the default transformer hparams for the provided model size; see
    # modeling.BertConfig for the possible hparams and util.training_utils for
    # the defaults
    self.model_hparam_overrides = (
        kwargs["model_hparam_overrides"]
        if "model_hparam_overrides" in kwargs else {})
    self.embedding_size = None  # bert hidden size by default
    self.vocab_size = 64000  # number of tokens in the vocabulary
    self.do_lower_case = True

    # training
    self.learning_rate = 1e-4
    self.weight_decay_rate = 0.01
    self.layerwise_lr_decay = 0.8  # if > 0, the learning rate for a layer is
                                   # lr * lr_decay^(depth - max_depth) i.e.,
                                   # shallower layers have lower learning rates
    self.num_train_epochs = 3.0  # passes over the dataset during training
    self.warmup_proportion = 0.1  # how much of training to warm up the LR for
    self.save_checkpoints_steps = 1000000
    self.iterations_per_loop = 1000
    self.use_tfrecords_if_existing = True  # don't make tfrecords and write them
                                           # to disc if existing ones are found

    # writing model outputs to disc
    self.write_test_outputs = False  # whether to write test set outputs,
                                     # currently supported for GLUE + SQuAD 2.0
    self.n_writes_test = 5  # write test set predictions for the first n trials

    # sizing
    self.max_seq_length = 128
    self.train_batch_size = 32
    self.eval_batch_size = 32
    self.predict_batch_size = 32
    self.double_unordered = True  # for tasks like paraphrase where sentence
                                  # order doesn't matter, train the model on
                                  # on both sentence orderings for each example
    # for qa tasks
    self.max_query_length = 64  # max tokens in q as opposed to context
    self.doc_stride = 128  # stride when splitting doc into multiple examples
    self.n_best_size = 20  # number of predictions per example to save
    self.max_answer_length = 30  # filter out answers longer than this length
    self.answerable_classifier = True  # answerable classifier for SQuAD 2.0
    self.answerable_uses_start_logits = True  # more advanced answerable
                                              # classifier using predicted start
    self.answerable_weight = 0.5  # weight for answerability loss
    self.joint_prediction = True  # jointly predict the start and end positions
                                  # of the answer span
    self.beam_size = 20  # beam size when doing joint predictions
    self.qa_na_threshold = -2.75  # threshold for "no answer" when writing SQuAD
                                  # 2.0 test outputs

    # TPU settings
    self.use_tpu = False
    self.num_tpu_cores = 1
    self.tpu_job_name = None
    self.tpu_name = None  # cloud TPU to use for training
    self.tpu_zone = None  # GCE zone where the Cloud TPU is located in
    self.gcp_project = None  # project name for the Cloud TPU-enabled project

    # default locations of data files
    self.data_dir = data_dir
    pretrained_model_dir = os.path.join(data_dir, "models", model_name)
    self.raw_data_dir = os.path.join(data_dir, "finetuning_data", "{:}").format
    self.vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")
    if not tf.io.gfile.exists(self.vocab_file):
      self.vocab_file = os.path.join(self.data_dir, "vocab.txt")
    task_names_str = ",".join(
        kwargs["task_names"] if "task_names" in kwargs else self.task_names)
    self.init_checkpoint = None if self.debug else pretrained_model_dir
    self.model_dir = os.path.join(pretrained_model_dir, "finetuning_models",
                                  task_names_str + "_model")
    results_dir = os.path.join(pretrained_model_dir, "results")
    self.results_txt = os.path.join(results_dir,
                                    task_names_str + "_results.txt")
    self.results_pkl = os.path.join(results_dir,
                                    task_names_str + "_results.pkl")
    qa_topdir = os.path.join(results_dir, task_names_str + "_qa")
    self.qa_eval_file = os.path.join(qa_topdir, "{:}_eval.json").format
    self.qa_preds_file = os.path.join(qa_topdir, "{:}_preds.json").format
    self.qa_na_file = os.path.join(qa_topdir, "{:}_null_odds.json").format
    self.preprocessed_data_dir = os.path.join(
        pretrained_model_dir, "finetuning_tfrecords",
        task_names_str + "_tfrecords" + ("-debug" if self.debug else ""))
    self.test_predictions = os.path.join(
        pretrained_model_dir, "test_predictions",
        "{:}_{:}_{:}_predictions.pkl").format

    # update defaults with passed-in hyperparameters
    self.update(kwargs)

    # default hyperparameters for single-task models
    if len(self.task_names) == 1:
      task_name = self.task_names[0]
      if task_name == "rte" or task_name == "sts":
        self.num_train_epochs = 10.0
      elif "squad" in task_name or "qa" in task_name:
        self.max_seq_length = 512
        self.num_train_epochs = 2.0
        self.write_distill_outputs = False
        self.write_test_outputs = False
      elif task_name == "chunk":
        self.max_seq_length = 256
      else:
        self.num_train_epochs = 3.0

    # default hyperparameters for different model sizes
    if self.model_size == "large":
      self.learning_rate = 5e-5
      self.layerwise_lr_decay = 0.9
    elif self.model_size == "small":
      self.embedding_size = 128

    # debug-mode settings
    if self.debug:
      self.save_checkpoints_steps = 1000000
      self.use_tfrecords_if_existing = False
      self.num_trials = 1
      self.iterations_per_loop = 1
      self.train_batch_size = 32
      self.num_train_epochs = 3.0
      self.log_examples = True

    # passed-in-arguments override (for example) debug-mode defaults
    self.update(kwargs)

  def update(self, kwargs):
    for k, v in kwargs.items():
      if k not in self.__dict__:
        raise ValueError("Unknown hparam " + k)
      self.__dict__[k] = v
sinatools/arabert/araelectra/configure_pretraining.py
@@ -1,143 +0,0 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Config controlling hyperparameters for pre-training ELECTRA."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os


class PretrainingConfig(object):
  """Defines pre-training hyperparameters."""

  def __init__(self, model_name, data_dir, **kwargs):
    self.model_name = model_name
    self.debug = False  # debug mode for quickly running things
    self.do_train = True  # pre-train ELECTRA
    self.do_eval = False  # evaluate generator/discriminator on unlabeled data

    # loss functions
    # train ELECTRA or Electric? if both are false, trains a masked LM like BERT
    self.electra_objective = True
    self.electric_objective = False
    self.gen_weight = 1.0  # masked language modeling / generator loss
    self.disc_weight = 50.0  # discriminator loss
    self.mask_prob = 0.15  # percent of input tokens to mask out / replace

    # optimization
    self.learning_rate = 2e-4
    self.lr_decay_power = 1.0  # linear weight decay by default
    self.weight_decay_rate = 0.01
    self.num_warmup_steps = 10000

    # training settings
    self.iterations_per_loop = 5000
    self.save_checkpoints_steps = 25000
    self.num_train_steps = 2000000
    self.num_eval_steps = 10000
    self.keep_checkpoint_max = 0  # maximum number of recent checkpoint files to keep;
                                  # change to 0 or None to keep all checkpoints

    # model settings
    self.model_size = "base"  # one of "small", "base", or "large"
    # override the default transformer hparams for the provided model size; see
    # modeling.BertConfig for the possible hparams and util.training_utils for
    # the defaults
    self.model_hparam_overrides = (
        kwargs["model_hparam_overrides"]
        if "model_hparam_overrides" in kwargs else {})
    self.embedding_size = None  # bert hidden size by default
    self.vocab_size = 64000  # number of tokens in the vocabulary
    self.do_lower_case = False  # lowercase the input?

    # generator settings
    self.uniform_generator = False  # generator is uniform at random
    self.two_tower_generator = False  # generator is a two-tower cloze model
    self.untied_generator_embeddings = False  # tie generator/discriminator
                                              # token embeddings?
    self.untied_generator = True  # tie all generator/discriminator weights?
    self.generator_layers = 1.0  # frac of discriminator layers for generator
    self.generator_hidden_size = 0.25  # frac of discrim hidden size for gen
    self.disallow_correct = False  # force the generator to sample incorrect
                                   # tokens (so 15% of tokens are always
                                   # fake)
    self.temperature = 1.0  # temperature for sampling from generator

    # batch sizes
    self.max_seq_length = 512
    self.train_batch_size = 256
    self.eval_batch_size = 256

    # TPU settings
    self.use_tpu = True
    self.num_tpu_cores = 8
    self.tpu_job_name = None
    self.tpu_name = ""  # cloud TPU to use for training
    self.tpu_zone = ""  # GCE zone where the Cloud TPU is located in
    self.gcp_project = ""  # project name for the Cloud TPU-enabled project

    # default locations of data files
    self.pretrain_tfrecords = os.path.join(
        data_dir, "pretraining_data/512/*")
    self.vocab_file = os.path.join(data_dir, "bertvocab_final.txt")
    self.model_dir = os.path.join(data_dir, "models", model_name)
    results_dir = os.path.join(self.model_dir, "results")
    self.results_txt = os.path.join(results_dir, "unsup_results.txt")
    self.results_pkl = os.path.join(results_dir, "unsup_results.pkl")

    # update defaults with passed-in hyperparameters
    self.update(kwargs)

    self.max_predictions_per_seq = int((self.mask_prob + 0.005) *
                                       self.max_seq_length)

    # debug-mode settings
    if self.debug:
      self.train_batch_size = 8
      self.num_train_steps = 20
      self.eval_batch_size = 4
      self.iterations_per_loop = 1
      self.num_eval_steps = 2

    # defaults for different-sized model
    if self.model_size == "small":
      self.embedding_size = 128
    # Here are the hyperparameters we used for larger models; see Table 6 in the
    # paper for the full hyperparameters
    else:
      self.max_seq_length = 512
      self.learning_rate = 2e-4
      if self.model_size == "base":
        self.embedding_size = 768
        self.generator_hidden_size = 0.33333
        self.train_batch_size = 256
      else:
        self.embedding_size = 1024
        self.mask_prob = 0.25
        self.train_batch_size = 2048
    if self.electric_objective:
      self.two_tower_generator = True  # electric requires a two-tower generator

    # passed-in-arguments override (for example) debug-mode defaults
    self.update(kwargs)

  def update(self, kwargs):
    for k, v in kwargs.items():
      if k not in self.__dict__:
        raise ValueError("Unknown hparam " + k)
      self.__dict__[k] = v
@@ -1,14 +0,0 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.