SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/araelectra/build_pretraining_dataset.py
@@ -1,230 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Writes out text data as tfrecords that ELECTRA can be pre-trained on."""
-
- import argparse
- import multiprocessing
- import os
- import random
- import time
- import tensorflow as tf
-
- from model import tokenization
- from util import utils
-
-
- def create_int_feature(values):
-   feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-   return feature
-
-
- class ExampleBuilder(object):
-   """Given a stream of input text, creates pretraining examples."""
-
-   def __init__(self, tokenizer, max_length):
-     self._tokenizer = tokenizer
-     self._current_sentences = []
-     self._current_length = 0
-     self._max_length = max_length
-     self._target_length = max_length
-
-   def add_line(self, line):
-     """Adds a line of text to the current example being built."""
-     line = line.strip().replace("\n", " ")
-     if (not line) and self._current_length != 0:  # empty lines separate docs
-       return self._create_example()
-     bert_tokens = self._tokenizer.tokenize(line)
-     bert_tokids = self._tokenizer.convert_tokens_to_ids(bert_tokens)
-     self._current_sentences.append(bert_tokids)
-     self._current_length += len(bert_tokids)
-     if self._current_length >= self._target_length:
-       return self._create_example()
-     return None
-
-   def _create_example(self):
-     """Creates a pre-training example from the current list of sentences."""
-     # small chance to only have one segment as in classification tasks
-     if random.random() < 0.1:
-       first_segment_target_length = 100000
-     else:
-       # -3 due to not yet having [CLS]/[SEP] tokens in the input text
-       first_segment_target_length = (self._target_length - 3) // 2
-
-     first_segment = []
-     second_segment = []
-     for sentence in self._current_sentences:
-       # the sentence goes to the first segment if (1) the first segment is
-       # empty, (2) the sentence doesn't put the first segment over length or
-       # (3) 50% of the time when it does put the first segment over length
-       if (len(first_segment) == 0 or
-           len(first_segment) + len(sentence) < first_segment_target_length or
-           (len(second_segment) == 0 and
-            len(first_segment) < first_segment_target_length and
-            random.random() < 0.5)):
-         first_segment += sentence
-       else:
-         second_segment += sentence
-
-     # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens
-     first_segment = first_segment[:self._max_length - 2]
-     second_segment = second_segment[:max(0, self._max_length -
-                                          len(first_segment) - 3)]
-
-     # prepare to start building the next example
-     self._current_sentences = []
-     self._current_length = 0
-     # small chance for random-length instead of max_length-length example
-     if random.random() < 0.05:
-       self._target_length = random.randint(5, self._max_length)
-     else:
-       self._target_length = self._max_length
-
-     return self._make_tf_example(first_segment, second_segment)
-
-   def _make_tf_example(self, first_segment, second_segment):
-     """Converts two "segments" of text into a tf.train.Example."""
-     vocab = self._tokenizer.vocab
-     input_ids = [vocab["[CLS]"]] + first_segment + [vocab["[SEP]"]]
-     segment_ids = [0] * len(input_ids)
-     if second_segment:
-       input_ids += second_segment + [vocab["[SEP]"]]
-       segment_ids += [1] * (len(second_segment) + 1)
-     input_mask = [1] * len(input_ids)
-     input_ids += [0] * (self._max_length - len(input_ids))
-     input_mask += [0] * (self._max_length - len(input_mask))
-     segment_ids += [0] * (self._max_length - len(segment_ids))
-     tf_example = tf.train.Example(features=tf.train.Features(feature={
-         "input_ids": create_int_feature(input_ids),
-         "input_mask": create_int_feature(input_mask),
-         "segment_ids": create_int_feature(segment_ids)
-     }))
-     return tf_example
-
-
- class ExampleWriter(object):
-   """Writes pre-training examples to disk."""
-
-   def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
-                num_jobs, blanks_separate_docs, do_lower_case,
-                num_out_files=1000):
-     self._blanks_separate_docs = blanks_separate_docs
-     tokenizer = tokenization.FullTokenizer(
-         vocab_file=vocab_file,
-         do_lower_case=do_lower_case)
-     self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
-     self._writers = []
-     for i in range(num_out_files):
-       if i % num_jobs == job_id:
-         output_fname = os.path.join(
-             output_dir, "pretrain_data.tfrecord-{:}-of-{:}".format(
-                 i, num_out_files))
-         self._writers.append(tf.io.TFRecordWriter(output_fname))
-     self.n_written = 0
-
-   def write_examples(self, input_file):
-     """Writes out examples from the provided input file."""
-     with tf.io.gfile.GFile(input_file) as f:
-       for line in f:
-         line = line.strip()
-         if line or self._blanks_separate_docs:
-           example = self._example_builder.add_line(line)
-           if example:
-             self._writers[self.n_written % len(self._writers)].write(
-                 example.SerializeToString())
-             self.n_written += 1
-     example = self._example_builder.add_line("")
-     if example:
-       self._writers[self.n_written % len(self._writers)].write(
-           example.SerializeToString())
-       self.n_written += 1
-
-   def finish(self):
-     for writer in self._writers:
-       writer.close()
-
-
- def write_examples(job_id, args):
-   """A single process creating and writing out pre-processed examples."""
-
-   def log(*args):
-     msg = " ".join(map(str, args))
-     print("Job {}:".format(job_id), msg)
-
-   log("Creating example writer")
-   example_writer = ExampleWriter(
-       job_id=job_id,
-       vocab_file=args.vocab_file,
-       output_dir=args.output_dir,
-       max_seq_length=args.max_seq_length,
-       num_jobs=args.num_processes,
-       blanks_separate_docs=args.blanks_separate_docs,
-       do_lower_case=args.do_lower_case
-   )
-   log("Writing tf examples")
-   fnames = sorted(tf.io.gfile.listdir(args.corpus_dir))
-   fnames = [f for (i, f) in enumerate(fnames)
-             if i % args.num_processes == job_id]
-   random.shuffle(fnames)
-   start_time = time.time()
-   for file_no, fname in enumerate(fnames):
-     if file_no > 0:
-       elapsed = time.time() - start_time
-       log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
-           "{:} examples written".format(
-               file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed),
-               int((len(fnames) - file_no) / (file_no / elapsed)),
-               example_writer.n_written))
-     example_writer.write_examples(os.path.join(args.corpus_dir, fname))
-   example_writer.finish()
-   log("Done!")
-
-
- def main():
-   parser = argparse.ArgumentParser(description=__doc__)
-   parser.add_argument("--corpus-dir", required=True,
-                       help="Location of pre-training text files.")
-   parser.add_argument("--vocab-file", required=True,
-                       help="Location of vocabulary file.")
-   parser.add_argument("--output-dir", required=True,
-                       help="Where to write out the tfrecords.")
-   parser.add_argument("--max-seq-length", default=128, type=int,
-                       help="Number of tokens per example.")
-   parser.add_argument("--num-processes", default=1, type=int,
-                       help="Parallelize across multiple processes.")
-   parser.add_argument("--blanks-separate-docs", default=True, type=bool,
-                       help="Whether blank lines indicate document boundaries.")
-   parser.add_argument("--do-lower-case", dest='do_lower_case',
-                       action='store_true', help="Lower case input text.")
-   parser.add_argument("--no-lower-case", dest='do_lower_case',
-                       action='store_false', help="Don't lower case input text.")
-   parser.set_defaults(do_lower_case=True)
-   args = parser.parse_args()
-
-   utils.rmkdir(args.output_dir)
-   if args.num_processes == 1:
-     write_examples(0, args)
-   else:
-     jobs = []
-     for i in range(args.num_processes):
-       job = multiprocessing.Process(target=write_examples, args=(i, args))
-       jobs.append(job)
-       job.start()
-     for job in jobs:
-       job.join()
-
-
- if __name__ == "__main__":
-   main()
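
Note (not part of the diff): the removed build_pretraining_dataset.py above is a stand-alone argparse CLI. Below is a minimal, hedged sketch of driving its write_examples() entry point programmatically; the paths are illustrative, and the script's own "from model import tokenization" / "from util import utils" imports appear to assume it is run from inside the araelectra source directory rather than imported from the installed package.

# Sketch only, not part of the diff. Assumes the removed module is importable
# (it shipped only up to SinaTools 0.1.40) and that its sibling `model`/`util`
# packages resolve, i.e. the araelectra source layout is on sys.path.
import argparse

args = argparse.Namespace(
    corpus_dir="corpus/",            # directory of plain-text pre-training files
    vocab_file="vocab.txt",          # WordPiece vocabulary used by FullTokenizer
    output_dir="pretrain_tfrecords/",
    max_seq_length=128,              # parser default shown above
    num_processes=1,
    blanks_separate_docs=True,
    do_lower_case=True,
)
# write_examples(0, args)  # job_id=0 since num_processes == 1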
sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py
@@ -1,90 +0,0 @@
- # coding=utf-8
-
- import argparse
- import os
- import tensorflow as tf
-
- import build_pretraining_dataset
- from model import tokenization
-
- class ExampleWriter(object):
-   """Writes pre-training examples to disk."""
-
-   def __init__(self, input_fname, vocab_file, output_dir, max_seq_length,
-                blanks_separate_docs, do_lower_case):
-     self._blanks_separate_docs = blanks_separate_docs
-     tokenizer = tokenization.FullTokenizer(
-         vocab_file=vocab_file,
-         do_lower_case=do_lower_case)
-     self._example_builder = build_pretraining_dataset.ExampleBuilder(tokenizer, max_seq_length)
-     output_fname = os.path.join(output_dir, "{}.tfrecord".format(input_fname.split("/")[-1]))
-     self._writer = tf.io.TFRecordWriter(output_fname)
-     self.n_written = 0
-
-   def write_examples(self, input_file):
-     """Writes out examples from the provided input file."""
-     with tf.io.gfile.GFile(input_file) as f:
-       for line in f:
-         line = line.strip()
-         if line or self._blanks_separate_docs:
-           example = self._example_builder.add_line(line)
-           if example:
-             self._writer.write(example.SerializeToString())
-             self.n_written += 1
-     example = self._example_builder.add_line("")
-     if example:
-       self._writer.write(example.SerializeToString())
-       self.n_written += 1
-
-   def finish(self):
-     self._writer.close()
-
- def write_examples(args):
-   """A single process creating and writing out pre-processed examples."""
-
-   def log(*args):
-     msg = " ".join(map(str, args))
-     print(msg)
-
-   log("Creating example writer")
-   example_writer = ExampleWriter(
-       input_fname=args.input_file,
-       vocab_file=args.vocab_file,
-       output_dir=args.output_dir,
-       max_seq_length=args.max_seq_length,
-       blanks_separate_docs=args.blanks_separate_docs,
-       do_lower_case=args.do_lower_case
-   )
-   log("Writing tf example")
-
-   example_writer.write_examples(args.input_file)
-   example_writer.finish()
-   log("Done!")
-   return
-
-
- def main():
-   parser = argparse.ArgumentParser(description=__doc__)
-   parser.add_argument("--input-file", required=True,
-                       help="Location of pre-training text files.")
-   parser.add_argument("--vocab-file", required=True,
-                       help="Location of vocabulary file.")
-   parser.add_argument("--output-dir", required=True,
-                       help="Where to write out the tfrecords.")
-   parser.add_argument("--max-seq-length", default=128, type=int,
-                       help="Number of tokens per example.")
-   parser.add_argument("--blanks-separate-docs", default=True, type=bool,
-                       help="Whether blank lines indicate document boundaries.")
-   parser.add_argument("--do-lower-case", dest='do_lower_case',
-                       action='store_true', help="Lower case input text.")
-   parser.add_argument("--no-lower-case", dest='do_lower_case',
-                       action='store_false', help="Don't lower case input text.")
-   parser.set_defaults(do_lower_case=True)
-   args = parser.parse_args()
-
-   write_examples(args)
-
-
-
- if __name__ == "__main__":
-   main()
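
Note (not part of the diff): unlike the multi-process script above, this removed single-file variant takes one --input-file and derives its output name from the input path, per the os.path.join call in ExampleWriter.__init__. A quick worked example of that naming rule (paths are illustrative):

# Worked example of the output naming used by the removed ExampleWriter.__init__ above.
import os

input_fname = "corpus/shard_000.txt"   # illustrative input path
output_dir = "pretrain_tfrecords"      # illustrative output directory
output_fname = os.path.join(output_dir, "{}.tfrecord".format(input_fname.split("/")[-1]))
print(output_fname)  # pretrain_tfrecords/shard_000.tfrecord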
sinatools/arabert/araelectra/configure_finetuning.py
@@ -1,172 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Config controlling hyperparameters for fine-tuning ELECTRA."""
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import os
-
- import tensorflow as tf
-
-
- class FinetuningConfig(object):
-   """Fine-tuning hyperparameters."""
-
-   def __init__(self, model_name, data_dir, **kwargs):
-     # general
-     self.model_name = model_name
-     self.debug = False  # debug mode for quickly running things
-     self.log_examples = False  # print out some train examples for debugging
-     self.num_trials = 1  # how many train+eval runs to perform
-     self.do_train = True  # train a model
-     self.do_eval = True  # evaluate the model
-     self.keep_all_models = True  # if False, only keep the last trial's ckpt
-
-     # model
-     self.model_size = "base"  # one of "small", "base", or "large"
-     self.task_names = ["chunk"]  # which tasks to learn
-     # override the default transformer hparams for the provided model size; see
-     # modeling.BertConfig for the possible hparams and util.training_utils for
-     # the defaults
-     self.model_hparam_overrides = (
-         kwargs["model_hparam_overrides"]
-         if "model_hparam_overrides" in kwargs else {})
-     self.embedding_size = None  # bert hidden size by default
-     self.vocab_size = 64000  # number of tokens in the vocabulary
-     self.do_lower_case = True
-
-     # training
-     self.learning_rate = 1e-4
-     self.weight_decay_rate = 0.01
-     self.layerwise_lr_decay = 0.8  # if > 0, the learning rate for a layer is
-                                    # lr * lr_decay^(depth - max_depth) i.e.,
-                                    # shallower layers have lower learning rates
-     self.num_train_epochs = 3.0  # passes over the dataset during training
-     self.warmup_proportion = 0.1  # how much of training to warm up the LR for
-     self.save_checkpoints_steps = 1000000
-     self.iterations_per_loop = 1000
-     self.use_tfrecords_if_existing = True  # don't make tfrecords and write them
-                                            # to disc if existing ones are found
-
-     # writing model outputs to disc
-     self.write_test_outputs = False  # whether to write test set outputs,
-                                      # currently supported for GLUE + SQuAD 2.0
-     self.n_writes_test = 5  # write test set predictions for the first n trials
-
-     # sizing
-     self.max_seq_length = 128
-     self.train_batch_size = 32
-     self.eval_batch_size = 32
-     self.predict_batch_size = 32
-     self.double_unordered = True  # for tasks like paraphrase where sentence
-                                   # order doesn't matter, train the model on
-                                   # on both sentence orderings for each example
-     # for qa tasks
-     self.max_query_length = 64  # max tokens in q as opposed to context
-     self.doc_stride = 128  # stride when splitting doc into multiple examples
-     self.n_best_size = 20  # number of predictions per example to save
-     self.max_answer_length = 30  # filter out answers longer than this length
-     self.answerable_classifier = True  # answerable classifier for SQuAD 2.0
-     self.answerable_uses_start_logits = True  # more advanced answerable
-                                               # classifier using predicted start
-     self.answerable_weight = 0.5  # weight for answerability loss
-     self.joint_prediction = True  # jointly predict the start and end positions
-                                   # of the answer span
-     self.beam_size = 20  # beam size when doing joint predictions
-     self.qa_na_threshold = -2.75  # threshold for "no answer" when writing SQuAD
-                                   # 2.0 test outputs
-
-     # TPU settings
-     self.use_tpu = False
-     self.num_tpu_cores = 1
-     self.tpu_job_name = None
-     self.tpu_name = None  # cloud TPU to use for training
-     self.tpu_zone = None  # GCE zone where the Cloud TPU is located in
-     self.gcp_project = None  # project name for the Cloud TPU-enabled project
-
-     # default locations of data files
-     self.data_dir = data_dir
-     pretrained_model_dir = os.path.join(data_dir, "models", model_name)
-     self.raw_data_dir = os.path.join(data_dir, "finetuning_data", "{:}").format
-     self.vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")
-     if not tf.io.gfile.exists(self.vocab_file):
-       self.vocab_file = os.path.join(self.data_dir, "vocab.txt")
-     task_names_str = ",".join(
-         kwargs["task_names"] if "task_names" in kwargs else self.task_names)
-     self.init_checkpoint = None if self.debug else pretrained_model_dir
-     self.model_dir = os.path.join(pretrained_model_dir, "finetuning_models",
-                                   task_names_str + "_model")
-     results_dir = os.path.join(pretrained_model_dir, "results")
-     self.results_txt = os.path.join(results_dir,
-                                     task_names_str + "_results.txt")
-     self.results_pkl = os.path.join(results_dir,
-                                     task_names_str + "_results.pkl")
-     qa_topdir = os.path.join(results_dir, task_names_str + "_qa")
-     self.qa_eval_file = os.path.join(qa_topdir, "{:}_eval.json").format
-     self.qa_preds_file = os.path.join(qa_topdir, "{:}_preds.json").format
-     self.qa_na_file = os.path.join(qa_topdir, "{:}_null_odds.json").format
-     self.preprocessed_data_dir = os.path.join(
-         pretrained_model_dir, "finetuning_tfrecords",
-         task_names_str + "_tfrecords" + ("-debug" if self.debug else ""))
-     self.test_predictions = os.path.join(
-         pretrained_model_dir, "test_predictions",
-         "{:}_{:}_{:}_predictions.pkl").format
-
-     # update defaults with passed-in hyperparameters
-     self.update(kwargs)
-
-     # default hyperparameters for single-task models
-     if len(self.task_names) == 1:
-       task_name = self.task_names[0]
-       if task_name == "rte" or task_name == "sts":
-         self.num_train_epochs = 10.0
-       elif "squad" in task_name or "qa" in task_name:
-         self.max_seq_length = 512
-         self.num_train_epochs = 2.0
-         self.write_distill_outputs = False
-         self.write_test_outputs = False
-       elif task_name == "chunk":
-         self.max_seq_length = 256
-       else:
-         self.num_train_epochs = 3.0
-
-     # default hyperparameters for different model sizes
-     if self.model_size == "large":
-       self.learning_rate = 5e-5
-       self.layerwise_lr_decay = 0.9
-     elif self.model_size == "small":
-       self.embedding_size = 128
-
-     # debug-mode settings
-     if self.debug:
-       self.save_checkpoints_steps = 1000000
-       self.use_tfrecords_if_existing = False
-       self.num_trials = 1
-       self.iterations_per_loop = 1
-       self.train_batch_size = 32
-       self.num_train_epochs = 3.0
-       self.log_examples = True
-
-     # passed-in-arguments override (for example) debug-mode defaults
-     self.update(kwargs)
-
-   def update(self, kwargs):
-     for k, v in kwargs.items():
-       if k not in self.__dict__:
-         raise ValueError("Unknown hparam " + k)
-       self.__dict__[k] = v
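
Note (not part of the diff): the update() method at the end of the removed FinetuningConfig is what applies keyword overrides (once before and once after the task- and size-specific defaults) and rejects unknown names. A hedged sketch, assuming the module is imported from a pre-1.0.1 release that still ships it, with placeholder model/data arguments:

# Sketch only; "araelectra-base" and "/data" are placeholders, and the import
# assumes an older SinaTools release that still contains this module.
from sinatools.arabert.araelectra.configure_finetuning import FinetuningConfig

config = FinetuningConfig("araelectra-base", "/data", learning_rate=5e-5)
print(config.learning_rate)   # 5e-05: passed-in kwargs override the defaults
FinetuningConfig("araelectra-base", "/data", learning_rte=5e-5)
# -> ValueError: Unknown hparam learning_rte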
sinatools/arabert/araelectra/configure_pretraining.py
@@ -1,143 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Config controlling hyperparameters for pre-training ELECTRA."""
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import os
-
-
- class PretrainingConfig(object):
-   """Defines pre-training hyperparameters."""
-
-   def __init__(self, model_name, data_dir, **kwargs):
-     self.model_name = model_name
-     self.debug = False  # debug mode for quickly running things
-     self.do_train = True  # pre-train ELECTRA
-     self.do_eval = False  # evaluate generator/discriminator on unlabeled data
-
-     # loss functions
-     # train ELECTRA or Electric? if both are false, trains a masked LM like BERT
-     self.electra_objective = True
-     self.electric_objective = False
-     self.gen_weight = 1.0  # masked language modeling / generator loss
-     self.disc_weight = 50.0  # discriminator loss
-     self.mask_prob = 0.15  # percent of input tokens to mask out / replace
-
-     # optimization
-     self.learning_rate = 2e-4
-     self.lr_decay_power = 1.0  # linear weight decay by default
-     self.weight_decay_rate = 0.01
-     self.num_warmup_steps = 10000
-
-     # training settings
-     self.iterations_per_loop = 5000
-     self.save_checkpoints_steps = 25000
-     self.num_train_steps = 2000000
-     self.num_eval_steps = 10000
-     self.keep_checkpoint_max = 0  # maximum number of recent checkpoint files to keep;
-                                   # change to 0 or None to keep all checkpoints
-
-     # model settings
-     self.model_size = "base"  # one of "small", "base", or "large"
-     # override the default transformer hparams for the provided model size; see
-     # modeling.BertConfig for the possible hparams and util.training_utils for
-     # the defaults
-     self.model_hparam_overrides = (
-         kwargs["model_hparam_overrides"]
-         if "model_hparam_overrides" in kwargs else {})
-     self.embedding_size = None  # bert hidden size by default
-     self.vocab_size = 64000  # number of tokens in the vocabulary
-     self.do_lower_case = False  # lowercase the input?
-
-     # generator settings
-     self.uniform_generator = False  # generator is uniform at random
-     self.two_tower_generator = False  # generator is a two-tower cloze model
-     self.untied_generator_embeddings = False  # tie generator/discriminator
-                                               # token embeddings?
-     self.untied_generator = True  # tie all generator/discriminator weights?
-     self.generator_layers = 1.0  # frac of discriminator layers for generator
-     self.generator_hidden_size = 0.25  # frac of discrim hidden size for gen
-     self.disallow_correct = False  # force the generator to sample incorrect
-                                    # tokens (so 15% of tokens are always
-                                    # fake)
-     self.temperature = 1.0  # temperature for sampling from generator
-
-     # batch sizes
-     self.max_seq_length = 512
-     self.train_batch_size = 256
-     self.eval_batch_size = 256
-
-     # TPU settings
-     self.use_tpu = True
-     self.num_tpu_cores = 8
-     self.tpu_job_name = None
-     self.tpu_name = ""  # cloud TPU to use for training
-     self.tpu_zone = ""  # GCE zone where the Cloud TPU is located in
-     self.gcp_project = ""  # project name for the Cloud TPU-enabled project
-
-     # default locations of data files
-     self.pretrain_tfrecords = os.path.join(
-         data_dir, "pretraining_data/512/*")
-     self.vocab_file = os.path.join(data_dir, "bertvocab_final.txt")
-     self.model_dir = os.path.join(data_dir, "models", model_name)
-     results_dir = os.path.join(self.model_dir, "results")
-     self.results_txt = os.path.join(results_dir, "unsup_results.txt")
-     self.results_pkl = os.path.join(results_dir, "unsup_results.pkl")
-
-     # update defaults with passed-in hyperparameters
-     self.update(kwargs)
-
-     self.max_predictions_per_seq = int((self.mask_prob + 0.005) *
-                                        self.max_seq_length)
-
-     # debug-mode settings
-     if self.debug:
-       self.train_batch_size = 8
-       self.num_train_steps = 20
-       self.eval_batch_size = 4
-       self.iterations_per_loop = 1
-       self.num_eval_steps = 2
-
-     # defaults for different-sized model
-     if self.model_size == "small":
-       self.embedding_size = 128
-     # Here are the hyperparameters we used for larger models; see Table 6 in the
-     # paper for the full hyperparameters
-     else:
-       self.max_seq_length = 512
-       self.learning_rate = 2e-4
-       if self.model_size == "base":
-         self.embedding_size = 768
-         self.generator_hidden_size = 0.33333
-         self.train_batch_size = 256
-       else:
-         self.embedding_size = 1024
-         self.mask_prob = 0.25
-         self.train_batch_size = 2048
-     if self.electric_objective:
-       self.two_tower_generator = True  # electric requires a two-tower generator
-
-     # passed-in-arguments override (for example) debug-mode defaults
-     self.update(kwargs)
-
-   def update(self, kwargs):
-     for k, v in kwargs.items():
-       if k not in self.__dict__:
-         raise ValueError("Unknown hparam " + k)
-       self.__dict__[k] = v
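
Note (not part of the diff): in the removed PretrainingConfig above, max_predictions_per_seq is derived from mask_prob and max_seq_length immediately after the first update(kwargs) call, so the later model-size branch can change mask_prob (for "large") without recomputing it. With the defaults shown in this hunk (mask_prob=0.15, max_seq_length=512) the arithmetic works out as follows:

# Worked example of the formula above, using the defaults shown in this hunk.
mask_prob, max_seq_length = 0.15, 512
max_predictions_per_seq = int((mask_prob + 0.005) * max_seq_length)
print(max_predictions_per_seq)  # 79  (0.155 * 512 = 79.36, truncated by int())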
@@ -1,14 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.