SinaTools 0.1.40-py2.py3-none-any.whl → 1.0.1-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
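The `sinatools/VERSION` entry above reflects the version bump from 0.1.40 to 1.0.1. As a quick post-upgrade sanity check, the installed distribution version can be read with the standard-library `importlib.metadata` API; this is a minimal sketch, not part of the SinaTools API itself:

```python
# Minimal sketch (Python 3.8+): confirm which SinaTools release is installed.
from importlib.metadata import PackageNotFoundError, version

try:
    # Distribution name as published on the registry; expected to print 1.0.1 after the upgrade.
    print(version("SinaTools"))
except PackageNotFoundError:
    print("SinaTools is not installed in this environment")
```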
sinatools/arabert/aragpt2/gpt2/run_pretraining.py (removed)

```diff
@@ -1,397 +0,0 @@
-import re
-import os
-import json
-import math
-import tensorflow as tf
-
-import optimization
-import collections
-
-from gpt_2_simple.src import model
-
-flags = tf.flags
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_integer("batch_size", 256, "batch_size")
-
-flags.DEFINE_integer("eval_batch_size", 8, "eval_batch_size")
-
-flags.DEFINE_integer("num_train_steps", 100000, "num_train_steps")
-
-flags.DEFINE_integer("num_warmup_steps", 10, "num_warmup_steps")
-
-flags.DEFINE_integer("start_warmup_step", 0, "start_warmup_step")
-
-flags.DEFINE_float("learning_rate", 1e-4, "learning_rate")
-
-flags.DEFINE_integer("save_checkpoints_steps", 1000, "save_checkpoints_steps")
-
-flags.DEFINE_integer("max_seq_length", 1024, "max_seq_length")
-
-flags.DEFINE_integer("max_eval_steps", 10, "Maximum number of eval steps.")
-
-flags.DEFINE_float("poly_power", 1.0, "The power of poly decay.")
-
-flags.DEFINE_enum("optimizer", "lamb", ["adamw", "lamb"], "The optimizer for training.")
-
-
-flags.DEFINE_integer(
-    "iterations_per_loop",
-    1000,
-    "How many steps to make in each estimator call.",
-)
-
-
-flags.DEFINE_integer(
-    "keep_checkpoint_max",
-    10,
-    "How ckpts to keep.",
-)
-
-flags.DEFINE_string(
-    "input_file",
-    None,
-    "Input TF example files (can be a glob or comma separated).",
-)
-
-flags.DEFINE_string(
-    "output_dir",
-    None,
-    "The output directory where the model checkpoints will be written.",
-)
-
-flags.DEFINE_string(
-    "config_file",
-    None,
-    "The config json file corresponding to the pre-trained GPT2 model. "
-    "This specifies the model architecture.",
-)
-
-flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint")
-
-
-flags.DEFINE_bool("use_tpu", True, "Whether to use TPU or GPU/CPU.")
-
-flags.DEFINE_string(
-    "tpu_name",
-    None,
-    "The Cloud TPU to use for training. This should be either the name "
-    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-    "url.",
-)
-
-flags.DEFINE_string(
-    "tpu_zone",
-    None,
-    "[Optional] GCE zone where the Cloud TPU is located in. If not "
-    "specified, we will attempt to automatically detect the GCE project from "
-    "metadata.",
-)
-
-flags.DEFINE_string(
-    "gcp_project",
-    None,
-    "[Optional] Project name for the Cloud TPU-enabled project. If not "
-    "specified, we will attempt to automatically detect the GCE project from "
-    "metadata.",
-)
-flags.DEFINE_integer(
-    "num_tpu_cores",
-    8,
-    "Only used if `use_tpu` is True. Total number of TPU cores to use.",
-)
-
-flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
-
-flags.DEFINE_bool("do_train", False, "Whether to run training.")
-
-flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
-
-
-def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
-    assignment_map = {}
-    initialized_variable_names = {}
-
-    name_to_variable = collections.OrderedDict()
-    for var in tvars:
-        name = var.name
-        m = re.match("^(.*):\\d+$", name)
-        if m is not None:
-            name = m.group(1)
-        name_to_variable[name] = var
-
-    init_vars = tf.train.list_variables(init_checkpoint)
-
-    assignment_map = collections.OrderedDict()
-    for x in init_vars:
-        (name, var) = (x[0], x[1])
-        if name not in name_to_variable:
-            continue
-        assignment_map[name] = name
-        initialized_variable_names[name] = 1
-        initialized_variable_names[name + ":0"] = 1
-
-    return (assignment_map, initialized_variable_names)
-
-
-def model_fn_builder(
-    hparams,
-    init_checkpoint,
-    learning_rate,
-    num_train_steps,
-    num_warmup_steps,
-    use_tpu,
-    optimizer,
-    poly_power,
-    start_warmup_step,
-    use_memory_saving_gradients
-):
-    def model_fn(features, labels, mode, params):
-        tf.logging.info("*** Features ***")
-        for name in sorted(features.keys()):
-            tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
-
-        input_ids = features["input_ids"]
-
-        output = model.model(hparams=hparams, X=input_ids)
-        loss = tf.reduce_mean(
-            input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
-                labels=input_ids[:, 1:], logits=output["logits"][:, :-1]
-            )
-        )
-
-        tvars = tf.trainable_variables()
-
-        initialized_variable_names = {}
-        scaffold_fn = None
-        if init_checkpoint:
-            (
-                assignment_map,
-                initialized_variable_names,
-            ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
-            if use_tpu:
-                def tpu_scaffold():
-                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-                    return tf.train.Scaffold()
-                scaffold_fn = tpu_scaffold
-            else:
-                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
-        tf.logging.info("**** Trainable Variables ****")
-        for var in tvars:
-            init_string = ""
-            if var.name in initialized_variable_names:
-                init_string = ", *INIT_FROM_CKPT*"
-            tf.logging.info(
-                " name = %s, shape = %s%s", var.name, var.shape, init_string
-            )
-
-        output_spec = None
-        if mode == tf.estimator.ModeKeys.TRAIN:
-            train_op = optimization.create_optimizer(
-                loss,
-                learning_rate,
-                num_train_steps,
-                num_warmup_steps,
-                use_tpu,
-                optimizer,
-                poly_power,
-                start_warmup_step,
-                use_memory_saving_gradients=use_memory_saving_gradients
-            )
-
-            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-                mode=mode,
-                loss=loss,
-                train_op=train_op,
-                scaffold_fn=scaffold_fn,
-            )
-        elif mode == tf.estimator.ModeKeys.EVAL:
-
-            def metric_fn(loss):
-                """Evaluation metric Fn which runs on CPU."""
-                perplexity = tf.exp(tf.reduce_mean(loss))
-                bpc = tf.reduce_mean(loss) / tf.constant(math.log(2))
-                return {
-                    "perplexity": tf.metrics.mean(perplexity),
-                    "bpc": tf.metrics.mean(bpc),
-                }
-
-            if FLAGS.use_tpu:
-                with tf.colocate_with(loss):
-                    loss = tf.contrib.tpu.cross_replica_sum(loss) \
-                        / FLAGS.num_tpu_cores
-            metric_loss = tf.tile(tf.reshape(loss, [1, 1]), [FLAGS.eval_batch_size, 1])
-            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-                mode=mode,
-                loss=loss,
-                eval_metrics=(metric_fn, [metric_loss]),
-                scaffold_fn=scaffold_fn)
-
-            # eval_metrics = (metric_fn, {"loss":loss})
-            # output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-            #     mode=mode,
-            #     loss=loss,
-            #     eval_metrics=eval_metrics,
-            #     scaffold_fn=scaffold_fn,
-            # )
-        else:
-            raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
-
-        return output_spec
-
-    return model_fn
-
-def input_fn_builder(input_files, max_seq_length, is_training, num_cpu_threads=4):
-    def input_fn(params):
-        batch_size = params["batch_size"]
-        name_to_features = {"input_ids": tf.FixedLenFeature([max_seq_length + 1], tf.int64)}
-        if is_training:
-            #d = tf.data.TFRecordDataset(input_files)
-            #d = d.repeat(1000)
-            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
-            d = d.repeat()
-            d = d.shuffle(buffer_size = len(input_files))
-            # `cycle_length` is the number of parallel files that get read.
-            cycle_length = min(num_cpu_threads, len(input_files))
-            # `sloppy` mode means that the interleaving is not exact. This adds
-            # even more randomness to the training pipeline.
-            d = d.apply(
-                tf.contrib.data.parallel_interleave(
-                    tf.data.TFRecordDataset,
-                    sloppy = is_training,
-                    cycle_length = cycle_length,
-                )
-            )
-            d = d.shuffle(buffer_size = 100)
-        else:
-            d = tf.data.TFRecordDataset(input_files)
-            # Since we evaluate for a fixed number of steps we don't want to encounter
-            # out-of-range exceptions.
-            d = d.repeat()
-
-        d = d.apply(
-            tf.contrib.data.map_and_batch(
-                lambda record: _decode_record(record, name_to_features),
-                batch_size=batch_size,
-                num_parallel_batches=num_cpu_threads,
-                drop_remainder=True,
-            )
-        )
-        return d
-
-    return input_fn
-
-def _decode_record(record, name_to_features):
-    """Decodes a record to a TensorFlow example."""
-    example = tf.parse_single_example(record, name_to_features)
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name in list(example.keys()):
-        t = example[name]
-        if t.dtype == tf.int64:
-            t = tf.to_int32(t)
-        example[name] = t
-
-    return example
-
-def main(_):
-    tf.logging.set_verbosity(tf.logging.INFO)
-    logger = tf.get_logger()
-    logger.propagate = False
-
-    if not FLAGS.do_train and not FLAGS.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    hparams = model.default_hparams()
-
-    with tf.gfile.GFile(FLAGS.config_file) as f:
-        hparams.override_from_dict(json.load(f))
-
-    tf.gfile.MakeDirs(FLAGS.output_dir)
-    input_files = []
-    for input_pattern in FLAGS.input_file.split(","):
-        input_files.extend(tf.gfile.Glob(input_pattern))
-
-    # tf.logging.info("*** Input Files ***")
-    # for input_file in input_files:
-    #     tf.logging.info(" %s" % input_file)
-
-    tpu_cluster_resolver = None
-    if FLAGS.use_tpu and FLAGS.tpu_name:
-        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
-            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project
-        )
-
-    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-    run_config = tf.contrib.tpu.RunConfig(
-        cluster=tpu_cluster_resolver,
-        master=FLAGS.master,
-        model_dir=FLAGS.output_dir,
-        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
-        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
-        tpu_config=tf.contrib.tpu.TPUConfig(
-            iterations_per_loop=FLAGS.iterations_per_loop,
-            num_shards=FLAGS.num_tpu_cores,
-            per_host_input_for_training=is_per_host,
-        ),
-    )
-
-    model_fn = model_fn_builder(
-        hparams=hparams,
-        init_checkpoint=FLAGS.init_checkpoint,
-        learning_rate=FLAGS.learning_rate,
-        num_train_steps=FLAGS.num_train_steps,
-        num_warmup_steps=FLAGS.num_warmup_steps,
-        use_tpu=FLAGS.use_tpu,
-        optimizer=FLAGS.optimizer,
-        poly_power=FLAGS.poly_power,
-        start_warmup_step=FLAGS.start_warmup_step,
-        use_memory_saving_gradients=FLAGS.use_memory_saving_gradients
-    )
-
-    # If TPU is not available, this will fall back to normal Estimator on CPU
-    # or GPU.
-    estimator = tf.contrib.tpu.TPUEstimator(
-        use_tpu=FLAGS.use_tpu,
-        model_fn=model_fn,
-        config=run_config,
-        train_batch_size=FLAGS.batch_size,
-        eval_batch_size=FLAGS.eval_batch_size,
-    )
-
-    if FLAGS.do_train:
-        tf.logging.info("***** Running training *****")
-        tf.logging.info(" Batch size = %d", FLAGS.batch_size)
-        train_input_fn = input_fn_builder(
-            input_files=input_files,
-            max_seq_length=FLAGS.max_seq_length,
-            is_training=True,
-        )
-        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
-
-    if FLAGS.do_eval:
-        tf.logging.info("***** Running evaluation *****")
-        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
-
-        eval_input_fn = input_fn_builder(
-            input_files=input_files,
-            max_seq_length=FLAGS.max_seq_length,
-            is_training=False,
-        )
-        result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
-
-        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
-        with tf.gfile.GFile(output_eval_file, "w") as writer:
-            tf.logging.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                tf.logging.info(" %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    flags.mark_flag_as_required("input_file")
-    flags.mark_flag_as_required("config_file")
-    flags.mark_flag_as_required("output_dir")
-    tf.app.run()
```
sinatools/arabert/aragpt2/grover/__init__.py: file without changes
sinatools/arabert/aragpt2/grover/dataloader.py (removed)

```diff
@@ -1,161 +0,0 @@
-# Original work Copyright 2018 The Google AI Language Team Authors.
-# Modified work Copyright 2019 Rowan Zellers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import tensorflow as tf
-
-
-def _decode_record(record, name_to_features):
-    """Decodes a record to a TensorFlow example."""
-    example = tf.parse_single_example(record, name_to_features)
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name in list(example.keys()):
-        t = example[name]
-        if t.dtype == tf.int64:
-            t = tf.cast(t, tf.int32)
-        example[name] = t
-    return example
-
-
-def input_fn_builder(input_files,
-                     seq_length,
-                     is_training,
-                     num_cpu_threads=4,
-                     evaluate_for_fixed_number_of_steps=True):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-    def input_fn(params):
-        """The actual input function."""
-        batch_size = params["batch_size"]
-        name_to_features = {
-            "input_ids": tf.FixedLenFeature([seq_length + 1], tf.int64),
-        }
-
-        # For training, we want a lot of parallel reading and shuffling.
-        # For eval, we want no shuffling and parallel reading doesn't matter.
-        if is_training:
-            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
-            d = d.repeat()
-            d = d.shuffle(buffer_size=len(input_files))
-
-            # `cycle_length` is the number of parallel files that get read.
-            cycle_length = min(num_cpu_threads, len(input_files))
-
-            # `sloppy` mode means that the interleaving is not exact. This adds
-            # even more randomness to the training pipeline.
-            d = d.apply(
-                tf.data.experimental.parallel_interleave(
-                    tf.data.TFRecordDataset,
-                    sloppy=is_training,
-                    cycle_length=cycle_length))
-            d = d.shuffle(buffer_size=100)
-        else:
-            d = tf.data.TFRecordDataset(input_files)
-            # If we evaluate for a fixed number of steps we don't want to encounter
-            # out-of-range exceptions.
-            if evaluate_for_fixed_number_of_steps:
-                d = d.repeat()
-
-        # We must `drop_remainder` on training because the TPU requires fixed
-        # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
-        # and we *don't* want to drop the remainder, otherwise we wont cover
-        # every sample.
-        d = d.apply(
-            tf.data.experimental.map_and_batch(
-                lambda record: _decode_record(record, name_to_features),
-                batch_size=batch_size,
-                num_parallel_batches=num_cpu_threads,
-                drop_remainder=True))
-        return d
-
-    return input_fn
-
-
-# ~~~~~~~~~~~~~~ This is for classification / AF ~~~~~~~~~~~~~~~~~~
-def classification_convert_examples_to_features(
-        examples, max_seq_length, batch_size, encoder, output_file, labels, pad_extra_examples=False,
-        chop_from_front_if_needed=True):
-    """Convert a set of `InputExample`s to a TFRecord file."""
-
-    writer = tf.python_io.TFRecordWriter(output_file)
-
-    label_map = {label: i for i, label in enumerate(labels)}
-
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
-
-        # begin_summary is our [CLS] token
-        tokens = example['ids'] + [encoder.begin_summary]
-
-        if len(tokens) > max_seq_length:
-            if chop_from_front_if_needed:
-                tokens = tokens[-max_seq_length:]
-            else:
-                tokens = example['ids'][:(max_seq_length-1)] + [encoder.begin_summary]
-        elif len(tokens) < max_seq_length:
-            tokens.extend([encoder.padding] * (max_seq_length - len(tokens)))
-
-        features = collections.OrderedDict()
-        features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=tokens))
-        features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[label_map[example['label']]]))
-        features['is_real_example'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[1]))
-        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-        writer.write(tf_example.SerializeToString())
-
-    if pad_extra_examples:
-        for x in range(len(examples) % batch_size):
-            features = collections.OrderedDict()
-            features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[0]*max_seq_length))
-            features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[0]))
-            features['is_real_example'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[0]))
-            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-            writer.write(tf_example.SerializeToString())
-    writer.close()
-
-
-def classification_input_fn_builder(input_file, seq_length, is_training,
-                                    drop_remainder,
-                                    buffer_size=100):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-    name_to_features = {
-        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
-        "label_ids": tf.FixedLenFeature([], tf.int64),
-        "is_real_example": tf.FixedLenFeature([], tf.int64),
-    }
-
-    def input_fn(params):
-        """The actual input function."""
-        batch_size = params["batch_size"]
-
-        # For training, we want a lot of parallel reading and shuffling.
-        # For eval, we want no shuffling and parallel reading doesn't matter.
-        d = tf.data.TFRecordDataset(input_file)
-        if is_training:
-            d = d.repeat()
-            d = d.shuffle(buffer_size=buffer_size)
-
-        d = d.apply(
-            tf.data.experimental.map_and_batch(
-                lambda record: _decode_record(record, name_to_features),
-                batch_size=batch_size,
-                drop_remainder=drop_remainder))
-
-        return d
-
-    return input_fn
```