SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,323 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2020 The Google Research Authors.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- """Fine-tunes an ELECTRA model on a downstream task."""
17
-
18
- from __future__ import absolute_import
19
- from __future__ import division
20
- from __future__ import print_function
21
-
22
- import argparse
23
- import collections
24
- import json
25
-
26
- import tensorflow as tf
27
-
28
- import configure_finetuning
29
- from finetune import preprocessing
30
- from finetune import task_builder
31
- from model import modeling
32
- from model import optimization
33
- from util import training_utils
34
- from util import utils
35
-
36
-
37
- class FinetuningModel(object):
38
- """Finetuning model with support for multi-task training."""
39
-
40
- def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
41
- is_training, features, num_train_steps):
42
- # Create a shared transformer encoder
43
- bert_config = training_utils.get_bert_config(config)
44
- self.bert_config = bert_config
45
- if config.debug:
46
- bert_config.num_hidden_layers = 3
47
- bert_config.hidden_size = 144
48
- bert_config.intermediate_size = 144 * 4
49
- bert_config.num_attention_heads = 4
50
- assert config.max_seq_length <= bert_config.max_position_embeddings
51
- bert_model = modeling.BertModel(
52
- bert_config=bert_config,
53
- is_training=is_training,
54
- input_ids=features["input_ids"],
55
- input_mask=features["input_mask"],
56
- token_type_ids=features["segment_ids"],
57
- use_one_hot_embeddings=config.use_tpu,
58
- embedding_size=config.embedding_size)
59
- percent_done = (tf.cast(tf.train.get_or_create_global_step(), tf.float32) /
60
- tf.cast(num_train_steps, tf.float32))
61
-
62
- # Add specific tasks
63
- self.outputs = {"task_id": features["task_id"]}
64
- losses = []
65
- for task in tasks:
66
- with tf.variable_scope("task_specific/" + task.name):
67
- task_losses, task_outputs = task.get_prediction_module(
68
- bert_model, features, is_training, percent_done)
69
- losses.append(task_losses)
70
- self.outputs[task.name] = task_outputs
71
- self.loss = tf.reduce_sum(
72
- tf.stack(losses, -1) *
73
- tf.one_hot(features["task_id"], len(config.task_names)))
74
-
75
-
76
- def model_fn_builder(config: configure_finetuning.FinetuningConfig, tasks,
77
- num_train_steps, pretraining_config=None):
78
- """Returns `model_fn` closure for TPUEstimator."""
79
-
80
- def model_fn(features, labels, mode, params):
81
- """The `model_fn` for TPUEstimator."""
82
- utils.log("Building model...")
83
- is_training = (mode == tf.estimator.ModeKeys.TRAIN)
84
- model = FinetuningModel(
85
- config, tasks, is_training, features, num_train_steps)
86
-
87
- # Load pre-trained weights from checkpoint
88
- init_checkpoint = config.init_checkpoint
89
- if pretraining_config is not None:
90
- init_checkpoint = tf.train.latest_checkpoint(pretraining_config.model_dir)
91
- utils.log("Using checkpoint", init_checkpoint)
92
- tvars = tf.trainable_variables()
93
- scaffold_fn = None
94
- if init_checkpoint:
95
- assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
96
- tvars, init_checkpoint)
97
- if config.use_tpu:
98
- def tpu_scaffold():
99
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
100
- return tf.train.Scaffold()
101
- scaffold_fn = tpu_scaffold
102
- else:
103
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
104
-
105
- # Build model for training or prediction
106
- if mode == tf.estimator.ModeKeys.TRAIN:
107
- train_op = optimization.create_optimizer(
108
- model.loss, config.learning_rate, num_train_steps,
109
- weight_decay_rate=config.weight_decay_rate,
110
- use_tpu=config.use_tpu,
111
- warmup_proportion=config.warmup_proportion,
112
- layerwise_lr_decay_power=config.layerwise_lr_decay,
113
- n_transformer_layers=model.bert_config.num_hidden_layers
114
- )
115
- output_spec = tf.estimator.tpu.TPUEstimatorSpec(
116
- mode=mode,
117
- loss=model.loss,
118
- train_op=train_op,
119
- scaffold_fn=scaffold_fn,
120
- training_hooks=[training_utils.ETAHook(
121
- {} if config.use_tpu else dict(loss=model.loss),
122
- num_train_steps, config.iterations_per_loop, config.use_tpu, 10)])
123
- else:
124
- assert mode == tf.estimator.ModeKeys.PREDICT
125
- output_spec = tf.estimator.tpu.TPUEstimatorSpec(
126
- mode=mode,
127
- predictions=utils.flatten_dict(model.outputs),
128
- scaffold_fn=scaffold_fn)
129
-
130
- utils.log("Building complete")
131
- return output_spec
132
-
133
- return model_fn
134
-
135
-
136
- class ModelRunner(object):
137
- """Fine-tunes a model on a supervised task."""
138
-
139
- def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
140
- pretraining_config=None):
141
- self._config = config
142
- self._tasks = tasks
143
- self._preprocessor = preprocessing.Preprocessor(config, self._tasks)
144
-
145
- is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
146
- tpu_cluster_resolver = None
147
- if config.use_tpu and config.tpu_name:
148
- tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
149
- config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
150
- tpu_config = tf.estimator.tpu.TPUConfig(
151
- iterations_per_loop=config.iterations_per_loop,
152
- num_shards=config.num_tpu_cores,
153
- per_host_input_for_training=is_per_host,
154
- tpu_job_name=config.tpu_job_name)
155
- run_config = tf.estimator.tpu.RunConfig(
156
- cluster=tpu_cluster_resolver,
157
- model_dir=config.model_dir,
158
- save_checkpoints_steps=config.save_checkpoints_steps,
159
- save_checkpoints_secs=None,
160
- tpu_config=tpu_config)
161
-
162
- if self._config.do_train:
163
- (self._train_input_fn,
164
- self.train_steps) = self._preprocessor.prepare_train()
165
- else:
166
- self._train_input_fn, self.train_steps = None, 0
167
- model_fn = model_fn_builder(
168
- config=config,
169
- tasks=self._tasks,
170
- num_train_steps=self.train_steps,
171
- pretraining_config=pretraining_config)
172
- self._estimator = tf.estimator.tpu.TPUEstimator(
173
- use_tpu=config.use_tpu,
174
- model_fn=model_fn,
175
- config=run_config,
176
- train_batch_size=config.train_batch_size,
177
- eval_batch_size=config.eval_batch_size,
178
- predict_batch_size=config.predict_batch_size)
179
-
180
- def train(self):
181
- utils.log("Training for {:} steps".format(self.train_steps))
182
- self._estimator.train(
183
- input_fn=self._train_input_fn, max_steps=self.train_steps)
184
-
185
- def evaluate(self):
186
- return {task.name: self.evaluate_task(task) for task in self._tasks}
187
-
188
- def evaluate_task(self, task, split="dev", return_results=True):
189
- """Evaluate the current model."""
190
- utils.log("Evaluating", task.name)
191
- eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
192
- results = self._estimator.predict(input_fn=eval_input_fn,
193
- yield_single_examples=True)
194
- scorer = task.get_scorer()
195
- for r in results:
196
- if r["task_id"] != len(self._tasks): # ignore padding examples
197
- r = utils.nest_dict(r, self._config.task_names)
198
- scorer.update(r[task.name])
199
- if return_results:
200
- utils.log(task.name + ": " + scorer.results_str())
201
- utils.log()
202
- return dict(scorer.get_results())
203
- else:
204
- return scorer
205
-
206
- def write_classification_outputs(self, tasks, trial, split):
207
- """Write classification predictions to disk."""
208
- utils.log("Writing out predictions for", tasks, split)
209
- predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
210
- results = self._estimator.predict(input_fn=predict_input_fn,
211
- yield_single_examples=True)
212
- # task name -> eid -> model-logits
213
- logits = collections.defaultdict(dict)
214
- for r in results:
215
- if r["task_id"] != len(self._tasks):
216
- r = utils.nest_dict(r, self._config.task_names)
217
- task_name = self._config.task_names[r["task_id"]]
218
- logits[task_name][r[task_name]["eid"]] = (
219
- r[task_name]["logits"] if "logits" in r[task_name]
220
- else r[task_name]["predictions"])
221
- for task_name in logits:
222
- utils.log("Pickling predictions for {:} {:} examples ({:})".format(
223
- len(logits[task_name]), task_name, split))
224
- if trial <= self._config.n_writes_test:
225
- utils.write_pickle(logits[task_name], self._config.test_predictions(
226
- task_name, split, trial))
227
-
228
-
229
- def write_results(config: configure_finetuning.FinetuningConfig, results):
230
- """Write evaluation metrics to disk."""
231
- utils.log("Writing results to", config.results_txt)
232
- utils.mkdir(config.results_txt.rsplit("/", 1)[0])
233
- utils.write_pickle(results, config.results_pkl)
234
- with tf.io.gfile.GFile(config.results_txt, "w") as f:
235
- results_str = ""
236
- for trial_results in results:
237
- for task_name, task_results in trial_results.items():
238
- if task_name == "time" or task_name == "global_step":
239
- continue
240
- results_str += task_name + ": " + " - ".join(
241
- ["{:}: {:.2f}".format(k, v)
242
- for k, v in task_results.items()]) + "\n"
243
- f.write(results_str)
244
- utils.write_pickle(results, config.results_pkl)
245
-
246
-
247
- def run_finetuning(config: configure_finetuning.FinetuningConfig):
248
- """Run finetuning."""
249
-
250
- # Setup for training
251
- results = []
252
- trial = 1
253
- heading_info = "model={:}, trial {:}/{:}".format(
254
- config.model_name, trial, config.num_trials)
255
- heading = lambda msg: utils.heading(msg + ": " + heading_info)
256
- heading("Config")
257
- utils.log_config(config)
258
- generic_model_dir = config.model_dir
259
- tasks = task_builder.get_tasks(config)
260
-
261
- # Train and evaluate num_trials models with different random seeds
262
- while config.num_trials < 0 or trial <= config.num_trials:
263
- config.model_dir = generic_model_dir + "_" + str(trial)
264
- if config.do_train:
265
- utils.rmkdir(config.model_dir)
266
-
267
- model_runner = ModelRunner(config, tasks)
268
- if config.do_train:
269
- heading("Start training")
270
- model_runner.train()
271
- utils.log()
272
-
273
- if config.do_eval:
274
- heading("Run dev set evaluation")
275
- results.append(model_runner.evaluate())
276
- write_results(config, results)
277
- if config.write_test_outputs and trial <= config.n_writes_test:
278
- heading("Running on the test set and writing the predictions")
279
- for task in tasks:
280
- # Currently only writing preds for GLUE and SQuAD 2.0 is supported
281
- if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
282
- "sts"]:
283
- for split in task.get_test_splits():
284
- model_runner.write_classification_outputs([task], trial, split)
285
- elif task.name == "squad":
286
- scorer = model_runner.evaluate_task(task, "test", False)
287
- scorer.write_predictions()
288
- preds = utils.load_json(config.qa_preds_file("squad"))
289
- null_odds = utils.load_json(config.qa_na_file("squad"))
290
- for q, _ in preds.items():
291
- if null_odds[q] > config.qa_na_threshold:
292
- preds[q] = ""
293
- utils.write_json(preds, config.test_predictions(
294
- task.name, "test", trial))
295
- else:
296
- utils.log("Skipping task", task.name,
297
- "- writing predictions is not supported for this task")
298
-
299
- if trial != config.num_trials and (not config.keep_all_models):
300
- utils.rmrf(config.model_dir)
301
- trial += 1
302
-
303
-
304
- def main():
305
- parser = argparse.ArgumentParser(description=__doc__)
306
- parser.add_argument("--data-dir", required=True,
307
- help="Location of data files (model weights, etc).")
308
- parser.add_argument("--model-name", required=True,
309
- help="The name of the model being fine-tuned.")
310
- parser.add_argument("--hparams", default="{}",
311
- help="JSON dict of model hyperparameters.")
312
- args = parser.parse_args()
313
- if args.hparams.endswith(".json"):
314
- hparams = utils.load_json(args.hparams)
315
- else:
316
- hparams = json.loads(args.hparams)
317
- tf.logging.set_verbosity(tf.logging.ERROR)
318
- run_finetuning(configure_finetuning.FinetuningConfig(
319
- args.model_name, args.data_dir, **hparams))
320
-
321
-
322
- if __name__ == "__main__":
323
- main()