tf-models-nightly 2.15.0.dev20240104__py2.py3-none-any.whl → 2.15.0.dev20240105__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- official/nlp/data/create_pretraining_data.py +77 -27
- official/nlp/data/create_pretraining_data_test.py +8 -8
- official/vision/utils/object_detection/visualization_utils.py +5 -4
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/METADATA +1 -1
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/RECORD +9 -9
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/AUTHORS +0 -0
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/LICENSE +0 -0
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/WHEEL +0 -0
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/top_level.txt +0 -0
official/nlp/data/create_pretraining_data.py

@@ -19,6 +19,7 @@ import itertools
 import random

 # Import libraries
+
 from absl import app
 from absl import flags
 from absl import logging
@@ -35,8 +36,26 @@ flags.DEFINE_string(
     "output_file", None,
     "Output TF example file (or comma-separated list of files).")

-flags.
-
+flags.DEFINE_enum(
+    "tokenization",
+    "WordPiece",
+    ["WordPiece", "SentencePiece"],
+    "Specifies the tokenizer implementation, i.e., whether to use WordPiece "
+    "or SentencePiece tokenizer. Canonical BERT uses WordPiece tokenizer, "
+    "while ALBERT uses SentencePiece tokenizer.",
+)
+
+flags.DEFINE_string(
+    "vocab_file",
+    None,
+    "For WordPiece tokenization, the vocabulary file of the tokenizer.",
+)
+
+flags.DEFINE_string(
+    "sp_model_file",
+    "",
+    "For SentencePiece tokenization, the path to the model of the tokenizer.",
+)

 flags.DEFINE_bool(
     "do_lower_case", True,
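The new `tokenization` flag is declared as an absl enum, so unsupported values are rejected at flag-parsing time rather than deep inside the pipeline. A minimal, standalone sketch of that behaviour (illustrative only; the flag name and values mirror the hunk above, the rest is not package code):

```python
# Standalone sketch: absl enum flags reject values outside the allowed list.
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_enum(
    "tokenization", "WordPiece", ["WordPiece", "SentencePiece"],
    "Tokenizer implementation to use.")

FLAGS(["demo", "--tokenization=SentencePiece"])  # parses cleanly
print(FLAGS.tokenization)  # -> SentencePiece

# FLAGS(["demo", "--tokenization=BPE"]) would raise
# flags.IllegalFlagValueError, since "BPE" is not one of the enum values.
```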
@@ -44,8 +63,10 @@ flags.DEFINE_bool(
     "models and False for cased models.")

 flags.DEFINE_bool(
-    "do_whole_word_mask",
-
+    "do_whole_word_mask",
+    False,
+    "Whether to use whole word masking rather than per-token masking.",
+)

 flags.DEFINE_integer(
     "max_ngram_size", None,
@@ -198,16 +219,19 @@ def create_float_feature(values):
   return feature


-def create_training_instances(
-
-
-
-
-
-
-
-
-
+def create_training_instances(
+    input_files,
+    tokenizer,
+    processor_text_fn,
+    max_seq_length,
+    dupe_factor,
+    short_seq_prob,
+    masked_lm_prob,
+    max_predictions_per_seq,
+    rng,
+    do_whole_word_mask=False,
+    max_ngram_size=None,
+):
   """Create `TrainingInstance`s from raw text."""
   all_documents = [[]]

@@ -219,11 +243,8 @@ def create_training_instances(input_files,
   # that the "next sentence prediction" task doesn't span between documents.
   for input_file in input_files:
     with tf.io.gfile.GFile(input_file, "rb") as reader:
-
-        line =
-        if not line:
-          break
-        line = line.strip()
+      for line in reader:
+        line = processor_text_fn(line)

         # Empty lines are used as document delimiters
         if not line:
@@ -535,7 +556,7 @@ def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng):
   return output_ngrams


-def
+def _tokens_to_grams(tokens):
   """Reconstitue grams (words) from `tokens`.

   E.g.,
@@ -543,7 +564,8 @@ def _wordpieces_to_grams(tokens):
     grams: [ [1,2), [2, 4), [4,5) , [5, 6)]

   Args:
-    tokens: list of
+    tokens: list of tokens (word pieces or sentence pieces).
+
   Returns:
     List of _Grams representing spans of whole words
     (without "[CLS]" and "[SEP]").
@@ -570,7 +592,7 @@ def create_masked_lm_predictions(tokens, masked_lm_prob,
                                  max_ngram_size=None):
   """Creates the predictions for the masked LM objective."""
   if do_whole_word_mask:
-    grams =
+    grams = _tokens_to_grams(tokens)
   else:
     # Here we consider each token to be a word to allow for sub-word masking.
     if max_ngram_size:
@@ -633,9 +655,28 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
       trunc_tokens.pop()


+def get_processor_text_fn(is_sentence_piece, do_lower_case):
+  def processor_text_fn(text):
+    text = tokenization.convert_to_unicode(text)
+    if is_sentence_piece:
+      # Additional preprocessing specific to the SentencePiece tokenizer.
+      text = tokenization.preprocess_text(text, lower=do_lower_case)
+
+    return text.strip()
+
+  return processor_text_fn
+
+
 def main(_):
-
-
+  if FLAGS.tokenization == "WordPiece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
+    )
+    processor_text_fn = get_processor_text_fn(False, FLAGS.do_lower_case)
+  else:
+    assert FLAGS.tokenization == "SentencePiece"
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = get_processor_text_fn(True, FLAGS.do_lower_case)

   input_files = []
   for input_pattern in FLAGS.input_file.split(","):
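`main` now selects the tokenizer from `FLAGS.tokenization` and builds a per-line `processor_text_fn` helper instead of the old readline loop. A rough, self-contained stand-in for what that helper does, assuming UTF-8 input; plain Python replaces `tokenization.convert_to_unicode` and `tokenization.preprocess_text`, and lowercasing is applied only on the SentencePiece path, as in the hunk above:

```python
# Illustrative stand-in, not the library helper: bytes read from a GFile opened
# in "rb" are decoded, optionally lowercased for SentencePiece, and stripped.
def make_processor_text_fn(is_sentence_piece, do_lower_case):
  def processor_text_fn(text):
    if isinstance(text, bytes):
      text = text.decode("utf-8", "ignore")
    if is_sentence_piece and do_lower_case:
      text = text.lower()  # the real preprocess_text also normalizes the text
    return text.strip()
  return processor_text_fn

process = make_processor_text_fn(is_sentence_piece=True, do_lower_case=True)
print(process(b"Hello, ALBERT!\n"))  # -> "hello, albert!"
```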
@@ -647,9 +688,18 @@ def main(_):

   rng = random.Random(FLAGS.random_seed)
   instances = create_training_instances(
-      input_files,
-
-
+      input_files,
+      tokenizer,
+      processor_text_fn,
+      FLAGS.max_seq_length,
+      FLAGS.dupe_factor,
+      FLAGS.short_seq_prob,
+      FLAGS.masked_lm_prob,
+      FLAGS.max_predictions_per_seq,
+      rng,
+      FLAGS.do_whole_word_mask,
+      FLAGS.max_ngram_size,
+  )

   output_files = FLAGS.output_file.split(",")
   logging.info("*** Writing to output files ***")
official/nlp/data/create_pretraining_data_test.py

@@ -43,7 +43,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
          continue
       self.fail("invalid mask value: {}".format(output_token))

-  def
+  def test_tokens_to_grams(self):
     tests = [
         (["That", "cone"], [(0, 1), (1, 2)]),
        (["That", "cone", "##s"], [(0, 1), (1, 3)]),
@@ -52,7 +52,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
         (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
     ]
     for inp, expected in tests:
-      output = cpd.
+      output = cpd._tokens_to_grams(inp)
       self.assertEqual(expected, output)

   def test_window(self):
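`_tokens_to_grams` (formerly `_wordpieces_to_grams`) groups sub-word pieces back into whole-word spans. A hypothetical, self-contained re-implementation of that grouping for WordPiece-style tokens (the function name and details below are illustrative, not the library code); it reproduces the `(begin, end)` pairs used in the tests above:

```python
# Hypothetical sketch: group WordPiece tokens into [start, end) whole-word
# spans, skipping "[CLS]"/"[SEP]" and merging "##"-prefixed continuations.
def group_wordpieces(tokens):
  spans = []
  start = None
  for i, token in enumerate(tokens):
    if token in ("[CLS]", "[SEP]"):
      if start is not None:
        spans.append((start, i))
        start = None
    elif token.startswith("##"):
      continue  # continuation piece: extends the current word
    else:
      if start is not None:
        spans.append((start, i))
      start = i
  if start is not None:
    spans.append((start, len(tokens)))
  return spans

assert group_wordpieces(["That", "cone"]) == [(0, 1), (1, 2)]
assert group_wordpieces(["That", "cone", "##s"]) == [(0, 1), (1, 3)]
assert group_wordpieces(["[CLS]", "Up", "##dog", "[SEP]", "Down"]) == [(1, 3), (4, 5)]
```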
@@ -81,8 +81,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             rng=rng,
             do_whole_word_mask=False,
             max_ngram_size=None))
-    self.
-    self.
+    self.assertLen(masked_positions, 3)
+    self.assertLen(masked_labels, 3)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)

   def test_create_masked_lm_predictions_whole_word(self):
@@ -100,8 +100,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             max_ngram_size=None))
     # since we can't get exactly three tokens without breaking a word we
     # only take two.
-    self.
-    self.
+    self.assertLen(masked_positions, 2)
+    self.assertLen(masked_labels, 2)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
     # ensure that we took an entire word.
     self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])
@@ -119,8 +119,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             rng=rng,
             do_whole_word_mask=True,
             max_ngram_size=3))
-    self.
-    self.
+    self.assertLen(masked_positions, 76)
+    self.assertLen(masked_labels, 76)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)

official/vision/utils/object_detection/visualization_utils.py

@@ -969,6 +969,11 @@ def visualize_segmentation_outputs(
     )
     return tf.cast(images, dtype=tf.uint8)

+  if images.shape[3] > 3:
+    images = images[:, :, :, 0:3]
+  elif images.shape[3] == 1:
+    images = tf.image.grayscale_to_rgb(images)
+
   images = tf.nest.map_structure(
       tf.identity,
       tf.map_fn(
@@ -981,10 +986,6 @@ def visualize_segmentation_outputs(
       ),
   )

-  if images.shape[3] > 3:
-    images = images[:, :, :, 0:3]
-  elif images.shape[3] == 1:
-    images = tf.image.grayscale_to_rgb(images)
   if true_image_shape is None:
     true_shapes = tf.constant(-1, shape=[images.shape.as_list()[0], 3])
   else:
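These two hunks move the channel handling ahead of the per-image drawing in `tf.map_fn`, so every image is already three-channel before it is drawn. A minimal sketch of that normalization, assuming NHWC batches (`normalize_channels` is an illustrative name, not the library function):

```python
# Sketch: drop extra channels and expand single-channel images to RGB,
# mirroring the block that now runs before the per-image drawing.
import tensorflow as tf

def normalize_channels(images):
  if images.shape[3] > 3:
    images = images[:, :, :, 0:3]                 # keep the first three channels
  elif images.shape[3] == 1:
    images = tf.image.grayscale_to_rgb(images)    # replicate to three channels
  return images

rgbd = tf.zeros([2, 8, 8, 4])   # e.g., RGB plus an extra channel
gray = tf.zeros([2, 8, 8, 1])
print(normalize_channels(rgbd).shape)  # (2, 8, 8, 3)
print(normalize_channels(gray).shape)  # (2, 8, 8, 3)
```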
tf_models_nightly-2.15.0.dev20240105.dist-info/RECORD

@@ -266,8 +266,8 @@ official/nlp/data/__init__.py,sha256=1ToRMjre4mErL4Ek4_dMVxMjXNPossNXggV8fqbISao
 official/nlp/data/classifier_data_lib.py,sha256=fu3JV9LDn8WC2aU8vjvanMQP09rJa11PrF0IZKxsvhA,57081
 official/nlp/data/classifier_data_lib_test.py,sha256=VQ9cp3PMylTnIUB2L4gMG-93pIKwNTriQXuZJ1LmAZo,3362
 official/nlp/data/create_finetuning_data.py,sha256=7uGTMfLDVAFp7cAQqvAYE0M38WluHmRrRlF4PmSnr4c,16703
-official/nlp/data/create_pretraining_data.py,sha256=
-official/nlp/data/create_pretraining_data_test.py,sha256=
+official/nlp/data/create_pretraining_data.py,sha256=vwTPpsSQ8wsV8GLCvm_Rw6FrV-R7iy4VP2-HZYtVEvA,24310
+official/nlp/data/create_pretraining_data_test.py,sha256=0PHyjbfZW8WGM_YW2pFdz5phRrS-didEsz6raiN2UEQ,4817
 official/nlp/data/create_xlnet_pretraining_data.py,sha256=WT5AnPU3VV_5HygK7IPaMidjCQF4OnK2dwBbcYxZxwU,24199
 official/nlp/data/create_xlnet_pretraining_data_test.py,sha256=dZ5za7m_lEs1G0oVR_RQwp6GvQGFdnYEjo3i7oMdN_A,10940
 official/nlp/data/data_loader.py,sha256=Rgf5A4jS42dnfXcsdztizBF2kSFnH7gAygLYh49dE38,1698
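The RECORD entries change because both modified files have new contents and sizes. Per the wheel spec (PEP 376/427), each line is `path,sha256=<urlsafe unpadded base64 of the SHA-256 digest>,<size in bytes>`; a small illustrative sketch of how such an entry is derived (`record_entry` is not package code):

```python
# Sketch: build a wheel RECORD line for a file's bytes.
import base64
import hashlib

def record_entry(path, data: bytes) -> str:
  digest = hashlib.sha256(data).digest()
  b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
  return f"{path},sha256={b64},{len(data)}"

print(record_entry("official/nlp/data/create_pretraining_data.py", b"example bytes"))
```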
@@ -1124,7 +1124,7 @@ official/vision/utils/object_detection/preprocessor.py,sha256=w1OnfVQ-pQ02sYLgCY
 official/vision/utils/object_detection/region_similarity_calculator.py,sha256=OzatyMzdwTrdgWXJ2XeIOZazW120K4kIJy4O5glJZxA,4554
 official/vision/utils/object_detection/shape_utils.py,sha256=2rghJjGoDghjqtf2qcJ26oyKO2OmaSMjlZPwB5fJ7EM,3618
 official/vision/utils/object_detection/target_assigner.py,sha256=7lg2C4rH8qZua5mPv02F-GX0V59Ub-nH7UsKQ4_vLb8,24208
-official/vision/utils/object_detection/visualization_utils.py,sha256=
+official/vision/utils/object_detection/visualization_utils.py,sha256=iSgKiBVWEYc_uIrC3aW_UOa9WZwQbBy0fm4k95Gf6iU,40267
 orbit/__init__.py,sha256=aQRo8zqIQ0Dw4JQReZeiB6MmuJLvvw4DbYHYti5AGys,1117
 orbit/controller.py,sha256=iOpz5DP-pSisTjUxCsMvYP_Q2YsKwfsSvdqmLnOrJfw,25368
 orbit/controller_test.py,sha256=FABwjwpeCKbW-FhmqztfWv8NuBUjr5uG7mrmqEjf6DY,31802
@@ -1158,9 +1158,9 @@ tensorflow_models/__init__.py,sha256=Ciz_YBke6teb6y42QyQTUBDdXJAiV7Qdu1zOoZvYiKw
 tensorflow_models/tensorflow_models_test.py,sha256=qUBLFZg7rmKkVQ3cHJVlkoid8cPqjjyc2ZiWtjQO5_o,1395
 tensorflow_models/nlp/__init__.py,sha256=3dULDpUBpDi9vljpXadq6oJrWH4y6z42Bz2d3hopYZw,807
 tensorflow_models/vision/__init__.py,sha256=4y77XkHaH8qLls3-6ta4tMp3Xj8CLbB0ihH91HsQ9z4,833
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
+tf_models_nightly-2.15.0.dev20240105.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.15.0.dev20240105.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.15.0.dev20240105.dist-info/METADATA,sha256=8P_GKo7TGlr-fTDjka7nIiXLJf7yA-7HoN7XLRxAlr4,1414
+tf_models_nightly-2.15.0.dev20240105.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.15.0.dev20240105.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.15.0.dev20240105.dist-info/RECORD,,
The remaining dist-info files (AUTHORS, LICENSE, WHEEL, top_level.txt) are unchanged.