SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/aragpt2/gpt2/run_pretraining.py
@@ -1,397 +0,0 @@
- import re
- import os
- import json
- import math
- import tensorflow as tf
-
- import optimization
- import collections
-
- from gpt_2_simple.src import model
-
- flags = tf.flags
-
- FLAGS = flags.FLAGS
-
- flags.DEFINE_integer("batch_size", 256, "batch_size")
-
- flags.DEFINE_integer("eval_batch_size", 8, "eval_batch_size")
-
- flags.DEFINE_integer("num_train_steps", 100000, "num_train_steps")
-
- flags.DEFINE_integer("num_warmup_steps", 10, "num_warmup_steps")
-
- flags.DEFINE_integer("start_warmup_step", 0, "start_warmup_step")
-
- flags.DEFINE_float("learning_rate", 1e-4, "learning_rate")
-
- flags.DEFINE_integer("save_checkpoints_steps", 1000, "save_checkpoints_steps")
-
- flags.DEFINE_integer("max_seq_length", 1024, "max_seq_length")
-
- flags.DEFINE_integer("max_eval_steps", 10, "Maximum number of eval steps.")
-
- flags.DEFINE_float("poly_power", 1.0, "The power of poly decay.")
-
- flags.DEFINE_enum("optimizer", "lamb", ["adamw", "lamb"], "The optimizer for training.")
-
-
- flags.DEFINE_integer(
-     "iterations_per_loop",
-     1000,
-     "How many steps to make in each estimator call.",
- )
-
-
- flags.DEFINE_integer(
-     "keep_checkpoint_max",
-     10,
-     "How ckpts to keep.",
- )
-
- flags.DEFINE_string(
-     "input_file",
-     None,
-     "Input TF example files (can be a glob or comma separated).",
- )
-
- flags.DEFINE_string(
-     "output_dir",
-     None,
-     "The output directory where the model checkpoints will be written.",
- )
-
- flags.DEFINE_string(
-     "config_file",
-     None,
-     "The config json file corresponding to the pre-trained GPT2 model. "
-     "This specifies the model architecture.",
- )
-
- flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint")
-
-
- flags.DEFINE_bool("use_tpu", True, "Whether to use TPU or GPU/CPU.")
-
- flags.DEFINE_string(
-     "tpu_name",
-     None,
-     "The Cloud TPU to use for training. This should be either the name "
-     "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-     "url.",
- )
-
- flags.DEFINE_string(
-     "tpu_zone",
-     None,
-     "[Optional] GCE zone where the Cloud TPU is located in. If not "
-     "specified, we will attempt to automatically detect the GCE project from "
-     "metadata.",
- )
-
- flags.DEFINE_string(
-     "gcp_project",
-     None,
-     "[Optional] Project name for the Cloud TPU-enabled project. If not "
-     "specified, we will attempt to automatically detect the GCE project from "
-     "metadata.",
- )
- flags.DEFINE_integer(
-     "num_tpu_cores",
-     8,
-     "Only used if `use_tpu` is True. Total number of TPU cores to use.",
- )
-
- flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
-
- flags.DEFINE_bool("do_train", False, "Whether to run training.")
-
- flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
-
-
- def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
-     assignment_map = {}
-     initialized_variable_names = {}
-
-     name_to_variable = collections.OrderedDict()
-     for var in tvars:
-         name = var.name
-         m = re.match("^(.*):\\d+$", name)
-         if m is not None:
-             name = m.group(1)
-         name_to_variable[name] = var
-
-     init_vars = tf.train.list_variables(init_checkpoint)
-
-     assignment_map = collections.OrderedDict()
-     for x in init_vars:
-         (name, var) = (x[0], x[1])
-         if name not in name_to_variable:
-             continue
-         assignment_map[name] = name
-         initialized_variable_names[name] = 1
-         initialized_variable_names[name + ":0"] = 1
-
-     return (assignment_map, initialized_variable_names)
-
-
- def model_fn_builder(
-     hparams,
-     init_checkpoint,
-     learning_rate,
-     num_train_steps,
-     num_warmup_steps,
-     use_tpu,
-     optimizer,
-     poly_power,
-     start_warmup_step,
-     use_memory_saving_gradients
- ):
-     def model_fn(features, labels, mode, params):
-         tf.logging.info("*** Features ***")
-         for name in sorted(features.keys()):
-             tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
-
-         input_ids = features["input_ids"]
-
-         output = model.model(hparams=hparams, X=input_ids)
-         loss = tf.reduce_mean(
-             input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
-                 labels=input_ids[:, 1:], logits=output["logits"][:, :-1]
-             )
-         )
-
-         tvars = tf.trainable_variables()
-
-         initialized_variable_names = {}
-         scaffold_fn = None
-         if init_checkpoint:
-             (
-                 assignment_map,
-                 initialized_variable_names,
-             ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
-             if use_tpu:
-                 def tpu_scaffold():
-                     tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-                     return tf.train.Scaffold()
-                 scaffold_fn = tpu_scaffold
-             else:
-                 tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
-         tf.logging.info("**** Trainable Variables ****")
-         for var in tvars:
-             init_string = ""
-             if var.name in initialized_variable_names:
-                 init_string = ", *INIT_FROM_CKPT*"
-             tf.logging.info(
-                 " name = %s, shape = %s%s", var.name, var.shape, init_string
-             )
-
-         output_spec = None
-         if mode == tf.estimator.ModeKeys.TRAIN:
-             train_op = optimization.create_optimizer(
-                 loss,
-                 learning_rate,
-                 num_train_steps,
-                 num_warmup_steps,
-                 use_tpu,
-                 optimizer,
-                 poly_power,
-                 start_warmup_step,
-                 use_memory_saving_gradients=use_memory_saving_gradients
-             )
-
-             output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-                 mode=mode,
-                 loss=loss,
-                 train_op=train_op,
-                 scaffold_fn=scaffold_fn,
-             )
-         elif mode == tf.estimator.ModeKeys.EVAL:
-
-             def metric_fn(loss):
-                 """Evaluation metric Fn which runs on CPU."""
-                 perplexity = tf.exp(tf.reduce_mean(loss))
-                 bpc = tf.reduce_mean(loss) / tf.constant(math.log(2))
-                 return {
-                     "perplexity": tf.metrics.mean(perplexity),
-                     "bpc": tf.metrics.mean(bpc),
-                 }
-
-             if FLAGS.use_tpu:
-                 with tf.colocate_with(loss):
-                     loss = tf.contrib.tpu.cross_replica_sum(loss) \
-                         / FLAGS.num_tpu_cores
-                 metric_loss = tf.tile(tf.reshape(loss, [1, 1]), [FLAGS.eval_batch_size, 1])
-                 output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-                     mode=mode,
-                     loss=loss,
-                     eval_metrics=(metric_fn, [metric_loss]),
-                     scaffold_fn=scaffold_fn)
-
-             # eval_metrics = (metric_fn, {"loss":loss})
-             # output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-             #     mode=mode,
-             #     loss=loss,
-             #     eval_metrics=eval_metrics,
-             #     scaffold_fn=scaffold_fn,
-             # )
-         else:
-             raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
-
-         return output_spec
-
-     return model_fn
-
- def input_fn_builder(input_files, max_seq_length, is_training, num_cpu_threads=4):
-     def input_fn(params):
-         batch_size = params["batch_size"]
-         name_to_features = {"input_ids": tf.FixedLenFeature([max_seq_length + 1], tf.int64)}
-         if is_training:
-             #d = tf.data.TFRecordDataset(input_files)
-             #d = d.repeat(1000)
-             d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
-             d = d.repeat()
-             d = d.shuffle(buffer_size = len(input_files))
-             # `cycle_length` is the number of parallel files that get read.
-             cycle_length = min(num_cpu_threads, len(input_files))
-             # `sloppy` mode means that the interleaving is not exact. This adds
-             # even more randomness to the training pipeline.
-             d = d.apply(
-                 tf.contrib.data.parallel_interleave(
-                     tf.data.TFRecordDataset,
-                     sloppy = is_training,
-                     cycle_length = cycle_length,
-                 )
-             )
-             d = d.shuffle(buffer_size = 100)
-         else:
-             d = tf.data.TFRecordDataset(input_files)
-             # Since we evaluate for a fixed number of steps we don't want to encounter
-             # out-of-range exceptions.
-             d = d.repeat()
-
-         d = d.apply(
-             tf.contrib.data.map_and_batch(
-                 lambda record: _decode_record(record, name_to_features),
-                 batch_size=batch_size,
-                 num_parallel_batches=num_cpu_threads,
-                 drop_remainder=True,
-             )
-         )
-         return d
-
-     return input_fn
-
- def _decode_record(record, name_to_features):
-     """Decodes a record to a TensorFlow example."""
-     example = tf.parse_single_example(record, name_to_features)
-
-     # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-     # So cast all int64 to int32.
-     for name in list(example.keys()):
-         t = example[name]
-         if t.dtype == tf.int64:
-             t = tf.to_int32(t)
-         example[name] = t
-
-     return example
-
- def main(_):
-     tf.logging.set_verbosity(tf.logging.INFO)
-     logger = tf.get_logger()
-     logger.propagate = False
-
-     if not FLAGS.do_train and not FLAGS.do_eval:
-         raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-     hparams = model.default_hparams()
-
-     with tf.gfile.GFile(FLAGS.config_file) as f:
-         hparams.override_from_dict(json.load(f))
-
-     tf.gfile.MakeDirs(FLAGS.output_dir)
-     input_files = []
-     for input_pattern in FLAGS.input_file.split(","):
-         input_files.extend(tf.gfile.Glob(input_pattern))
-
-     # tf.logging.info("*** Input Files ***")
-     # for input_file in input_files:
-     #     tf.logging.info(" %s" % input_file)
-
-     tpu_cluster_resolver = None
-     if FLAGS.use_tpu and FLAGS.tpu_name:
-         tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
-             FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project
-         )
-
-     is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-     run_config = tf.contrib.tpu.RunConfig(
-         cluster=tpu_cluster_resolver,
-         master=FLAGS.master,
-         model_dir=FLAGS.output_dir,
-         save_checkpoints_steps=FLAGS.save_checkpoints_steps,
-         keep_checkpoint_max=FLAGS.keep_checkpoint_max,
-         tpu_config=tf.contrib.tpu.TPUConfig(
-             iterations_per_loop=FLAGS.iterations_per_loop,
-             num_shards=FLAGS.num_tpu_cores,
-             per_host_input_for_training=is_per_host,
-         ),
-     )
-
-     model_fn = model_fn_builder(
-         hparams=hparams,
-         init_checkpoint=FLAGS.init_checkpoint,
-         learning_rate=FLAGS.learning_rate,
-         num_train_steps=FLAGS.num_train_steps,
-         num_warmup_steps=FLAGS.num_warmup_steps,
-         use_tpu=FLAGS.use_tpu,
-         optimizer=FLAGS.optimizer,
-         poly_power=FLAGS.poly_power,
-         start_warmup_step=FLAGS.start_warmup_step,
-         use_memory_saving_gradients=FLAGS.use_memory_saving_gradients
-     )
-
-     # If TPU is not available, this will fall back to normal Estimator on CPU
-     # or GPU.
-     estimator = tf.contrib.tpu.TPUEstimator(
-         use_tpu=FLAGS.use_tpu,
-         model_fn=model_fn,
-         config=run_config,
-         train_batch_size=FLAGS.batch_size,
-         eval_batch_size=FLAGS.eval_batch_size,
-     )
-
-     if FLAGS.do_train:
-         tf.logging.info("***** Running training *****")
-         tf.logging.info(" Batch size = %d", FLAGS.batch_size)
-         train_input_fn = input_fn_builder(
-             input_files=input_files,
-             max_seq_length=FLAGS.max_seq_length,
-             is_training=True,
-         )
-         estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
-
-     if FLAGS.do_eval:
-         tf.logging.info("***** Running evaluation *****")
-         tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
-
-         eval_input_fn = input_fn_builder(
-             input_files=input_files,
-             max_seq_length=FLAGS.max_seq_length,
-             is_training=False,
-         )
-         result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
-
-         output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
-         with tf.gfile.GFile(output_eval_file, "w") as writer:
-             tf.logging.info("***** Eval results *****")
-             for key in sorted(result.keys()):
-                 tf.logging.info(" %s = %s", key, str(result[key]))
-                 writer.write("%s = %s\n" % (key, str(result[key])))
-
- if __name__ == "__main__":
-     flags.mark_flag_as_required("input_file")
-     flags.mark_flag_as_required("config_file")
-     flags.mark_flag_as_required("output_dir")
-     tf.app.run()
sinatools/arabert/aragpt2/grover/__init__.py
File without changes
sinatools/arabert/aragpt2/grover/dataloader.py
@@ -1,161 +0,0 @@
- # Original work Copyright 2018 The Google AI Language Team Authors.
- # Modified work Copyright 2019 Rowan Zellers
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import collections
- import tensorflow as tf
-
-
- def _decode_record(record, name_to_features):
-     """Decodes a record to a TensorFlow example."""
-     example = tf.parse_single_example(record, name_to_features)
-
-     # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-     # So cast all int64 to int32.
-     for name in list(example.keys()):
-         t = example[name]
-         if t.dtype == tf.int64:
-             t = tf.cast(t, tf.int32)
-         example[name] = t
-     return example
-
-
- def input_fn_builder(input_files,
-                      seq_length,
-                      is_training,
-                      num_cpu_threads=4,
-                      evaluate_for_fixed_number_of_steps=True):
-     """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-     def input_fn(params):
-         """The actual input function."""
-         batch_size = params["batch_size"]
-         name_to_features = {
-             "input_ids": tf.FixedLenFeature([seq_length + 1], tf.int64),
-         }
-
-         # For training, we want a lot of parallel reading and shuffling.
-         # For eval, we want no shuffling and parallel reading doesn't matter.
-         if is_training:
-             d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
-             d = d.repeat()
-             d = d.shuffle(buffer_size=len(input_files))
-
-             # `cycle_length` is the number of parallel files that get read.
-             cycle_length = min(num_cpu_threads, len(input_files))
-
-             # `sloppy` mode means that the interleaving is not exact. This adds
-             # even more randomness to the training pipeline.
-             d = d.apply(
-                 tf.data.experimental.parallel_interleave(
-                     tf.data.TFRecordDataset,
-                     sloppy=is_training,
-                     cycle_length=cycle_length))
-             d = d.shuffle(buffer_size=100)
-         else:
-             d = tf.data.TFRecordDataset(input_files)
-             # If we evaluate for a fixed number of steps we don't want to encounter
-             # out-of-range exceptions.
-             if evaluate_for_fixed_number_of_steps:
-                 d = d.repeat()
-
-         # We must `drop_remainder` on training because the TPU requires fixed
-         # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
-         # and we *don't* want to drop the remainder, otherwise we wont cover
-         # every sample.
-         d = d.apply(
-             tf.data.experimental.map_and_batch(
-                 lambda record: _decode_record(record, name_to_features),
-                 batch_size=batch_size,
-                 num_parallel_batches=num_cpu_threads,
-                 drop_remainder=True))
-         return d
-
-     return input_fn
-
-
- # ~~~~~~~~~~~~~~ This is for classification / AF ~~~~~~~~~~~~~~~~~~
- def classification_convert_examples_to_features(
-         examples, max_seq_length, batch_size, encoder, output_file, labels, pad_extra_examples=False,
-         chop_from_front_if_needed=True):
-     """Convert a set of `InputExample`s to a TFRecord file."""
-
-     writer = tf.python_io.TFRecordWriter(output_file)
-
-     label_map = {label: i for i, label in enumerate(labels)}
-
-     for (ex_index, example) in enumerate(examples):
-         if ex_index % 10000 == 0:
-             tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
-
-         # begin_summary is our [CLS] token
-         tokens = example['ids'] + [encoder.begin_summary]
-
-         if len(tokens) > max_seq_length:
-             if chop_from_front_if_needed:
-                 tokens = tokens[-max_seq_length:]
-             else:
-                 tokens = example['ids'][:(max_seq_length-1)] + [encoder.begin_summary]
-         elif len(tokens) < max_seq_length:
-             tokens.extend([encoder.padding] * (max_seq_length - len(tokens)))
-
-         features = collections.OrderedDict()
-         features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=tokens))
-         features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[label_map[example['label']]]))
-         features['is_real_example'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[1]))
-         tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-         writer.write(tf_example.SerializeToString())
-
-     if pad_extra_examples:
-         for x in range(len(examples) % batch_size):
-             features = collections.OrderedDict()
-             features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[0]*max_seq_length))
-             features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[0]))
-             features['is_real_example'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[0]))
-             tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-             writer.write(tf_example.SerializeToString())
-     writer.close()
-
-
- def classification_input_fn_builder(input_file, seq_length, is_training,
-                                     drop_remainder,
-                                     buffer_size=100):
-     """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-     name_to_features = {
-         "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
-         "label_ids": tf.FixedLenFeature([], tf.int64),
-         "is_real_example": tf.FixedLenFeature([], tf.int64),
-     }
-
-     def input_fn(params):
-         """The actual input function."""
-         batch_size = params["batch_size"]
-
-         # For training, we want a lot of parallel reading and shuffling.
-         # For eval, we want no shuffling and parallel reading doesn't matter.
-         d = tf.data.TFRecordDataset(input_file)
-         if is_training:
-             d = d.repeat()
-             d = d.shuffle(buffer_size=buffer_size)
-
-         d = d.apply(
-             tf.data.experimental.map_and_batch(
-                 lambda record: _decode_record(record, name_to_features),
-                 batch_size=batch_size,
-                 drop_remainder=drop_remainder))
-
-         return d
-
-     return input_fn