tf-models-nightly 2.15.0.dev20240104__py2.py3-none-any.whl → 2.15.0.dev20240105__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
official/nlp/data/create_pretraining_data.py
@@ -19,6 +19,7 @@ import itertools
 import random

 # Import libraries
+
 from absl import app
 from absl import flags
 from absl import logging
@@ -35,8 +36,26 @@ flags.DEFINE_string(
     "output_file", None,
     "Output TF example file (or comma-separated list of files).")

-flags.DEFINE_string("vocab_file", None,
-                    "The vocabulary file that the BERT model was trained on.")
+flags.DEFINE_enum(
+    "tokenization",
+    "WordPiece",
+    ["WordPiece", "SentencePiece"],
+    "Specifies the tokenizer implementation, i.e., whether to use WordPiece "
+    "or SentencePiece tokenizer. Canonical BERT uses WordPiece tokenizer, "
+    "while ALBERT uses SentencePiece tokenizer.",
+)
+
+flags.DEFINE_string(
+    "vocab_file",
+    None,
+    "For WordPiece tokenization, the vocabulary file of the tokenizer.",
+)
+
+flags.DEFINE_string(
+    "sp_model_file",
+    "",
+    "For SentencePiece tokenization, the path to the model of the tokenizer.",
+)

 flags.DEFINE_bool(
     "do_lower_case", True,
@@ -44,8 +63,10 @@ flags.DEFINE_bool(
     "models and False for cased models.")

 flags.DEFINE_bool(
-    "do_whole_word_mask", False,
-    "Whether to use whole word masking rather than per-WordPiece masking.")
+    "do_whole_word_mask",
+    False,
+    "Whether to use whole word masking rather than per-token masking.",
+)

 flags.DEFINE_integer(
     "max_ngram_size", None,
@@ -198,16 +219,19 @@ def create_float_feature(values):
   return feature


-def create_training_instances(input_files,
-                              tokenizer,
-                              max_seq_length,
-                              dupe_factor,
-                              short_seq_prob,
-                              masked_lm_prob,
-                              max_predictions_per_seq,
-                              rng,
-                              do_whole_word_mask=False,
-                              max_ngram_size=None):
+def create_training_instances(
+    input_files,
+    tokenizer,
+    processor_text_fn,
+    max_seq_length,
+    dupe_factor,
+    short_seq_prob,
+    masked_lm_prob,
+    max_predictions_per_seq,
+    rng,
+    do_whole_word_mask=False,
+    max_ngram_size=None,
+):
   """Create `TrainingInstance`s from raw text."""
   all_documents = [[]]

@@ -219,11 +243,8 @@ def create_training_instances(input_files,
   # that the "next sentence prediction" task doesn't span between documents.
   for input_file in input_files:
     with tf.io.gfile.GFile(input_file, "rb") as reader:
-      while True:
-        line = tokenization.convert_to_unicode(reader.readline())
-        if not line:
-          break
-        line = line.strip()
+      for line in reader:
+        line = processor_text_fn(line)

         # Empty lines are used as document delimiters
         if not line:
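Note: the file is still opened in "rb" mode, so each line yielded by the reader is a bytes object; unicode conversion and stripping now happen inside the injected processor_text_fn (defined by get_processor_text_fn further down). A rough, self-contained sketch of that contract with a hypothetical stand-in processor:

# Rough sketch with a hypothetical stand-in for processor_text_fn: the reader
# yields bytes (the file is opened "rb"), and the processor returns clean text.
def demo_processor_text_fn(raw_line):
  text = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
  return text.strip()

with open("corpus.txt", "wb") as f:  # stand-in corpus for the example
  f.write(b"First sentence.\n\nSecond document starts here.\n")

with open("corpus.txt", "rb") as reader:  # stand-in for tf.io.gfile.GFile
  lines = [demo_processor_text_fn(line) for line in reader]
print(lines)  # ['First sentence.', '', 'Second document starts here.']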
@@ -535,7 +556,7 @@ def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng):
   return output_ngrams


-def _wordpieces_to_grams(tokens):
+def _tokens_to_grams(tokens):
   """Reconstitue grams (words) from `tokens`.

   E.g.,
@@ -543,7 +564,8 @@ def _wordpieces_to_grams(tokens):
     grams: [ [1,2), [2, 4), [4,5) , [5, 6)]

   Args:
-    tokens: list of wordpieces
+    tokens: list of tokens (word pieces or sentence pieces).
+
   Returns:
     List of _Grams representing spans of whole words
     (without "[CLS]" and "[SEP]").
@@ -570,7 +592,7 @@ def create_masked_lm_predictions(tokens, masked_lm_prob,
                                  max_ngram_size=None):
   """Creates the predictions for the masked LM objective."""
   if do_whole_word_mask:
-    grams = _wordpieces_to_grams(tokens)
+    grams = _tokens_to_grams(tokens)
   else:
     # Here we consider each token to be a word to allow for sub-word masking.
     if max_ngram_size:
@@ -633,9 +655,28 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
       trunc_tokens.pop()


+def get_processor_text_fn(is_sentence_piece, do_lower_case):
+  def processor_text_fn(text):
+    text = tokenization.convert_to_unicode(text)
+    if is_sentence_piece:
+      # Additional preprocessing specific to the SentencePiece tokenizer.
+      text = tokenization.preprocess_text(text, lower=do_lower_case)
+
+    return text.strip()
+
+  return processor_text_fn
+
+
 def main(_):
-  tokenizer = tokenization.FullTokenizer(
-      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+  if FLAGS.tokenization == "WordPiece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
+    )
+    processor_text_fn = get_processor_text_fn(False, FLAGS.do_lower_case)
+  else:
+    assert FLAGS.tokenization == "SentencePiece"
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = get_processor_text_fn(True, FLAGS.do_lower_case)

   input_files = []
   for input_pattern in FLAGS.input_file.split(","):
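Note: a short, illustrative sketch of how the two processor variants built by get_processor_text_fn differ (assuming the package import path listed in RECORD; output descriptions in the comments are approximate, not literal):

# Illustrative only: both variants convert to unicode and strip; the
# SentencePiece variant additionally runs tokenization.preprocess_text,
# which also handles lowercasing when do_lower_case is set.
from official.nlp.data import create_pretraining_data as cpd

wordpiece_fn = cpd.get_processor_text_fn(is_sentence_piece=False, do_lower_case=True)
sentencepiece_fn = cpd.get_processor_text_fn(is_sentence_piece=True, do_lower_case=True)

raw = b"Some Raw Text \n"
print(wordpiece_fn(raw))       # unicode-converted and stripped only
print(sentencepiece_fn(raw))   # additionally preprocessed for SentencePiece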
@@ -647,9 +688,18 @@ def main(_):

   rng = random.Random(FLAGS.random_seed)
   instances = create_training_instances(
-      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
-      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
-      rng, FLAGS.do_whole_word_mask, FLAGS.max_ngram_size)
+      input_files,
+      tokenizer,
+      processor_text_fn,
+      FLAGS.max_seq_length,
+      FLAGS.dupe_factor,
+      FLAGS.short_seq_prob,
+      FLAGS.masked_lm_prob,
+      FLAGS.max_predictions_per_seq,
+      rng,
+      FLAGS.do_whole_word_mask,
+      FLAGS.max_ngram_size,
+  )

   output_files = FLAGS.output_file.split(",")
   logging.info("*** Writing to output files ***")
official/nlp/data/create_pretraining_data_test.py
@@ -43,7 +43,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
         continue
       self.fail("invalid mask value: {}".format(output_token))

-  def test_wordpieces_to_grams(self):
+  def test_tokens_to_grams(self):
     tests = [
         (["That", "cone"], [(0, 1), (1, 2)]),
         (["That", "cone", "##s"], [(0, 1), (1, 3)]),
@@ -52,7 +52,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
         (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
     ]
     for inp, expected in tests:
-      output = cpd._wordpieces_to_grams(inp)
+      output = cpd._tokens_to_grams(inp)
       self.assertEqual(expected, output)

   def test_window(self):
@@ -81,8 +81,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             rng=rng,
             do_whole_word_mask=False,
             max_ngram_size=None))
-    self.assertEqual(len(masked_positions), 3)
-    self.assertEqual(len(masked_labels), 3)
+    self.assertLen(masked_positions, 3)
+    self.assertLen(masked_labels, 3)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)

   def test_create_masked_lm_predictions_whole_word(self):
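Note: assertLen is provided by absl's TestCase (which tf.test.TestCase builds on); unlike assertEqual(len(x), n), its failure message also reports the container itself. A minimal sketch:

# Minimal sketch (assumes absl's TestCase, which provides assertLen):
from absl.testing import absltest


class LenMessageExample(absltest.TestCase):

  def test_len(self):
    values = ["a", "b"]
    self.assertLen(values, 2)    # passes
    # self.assertLen(values, 3)  # failure message would include the list itself


if __name__ == "__main__":
  absltest.main()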
@@ -100,8 +100,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             max_ngram_size=None))
     # since we can't get exactly three tokens without breaking a word we
     # only take two.
-    self.assertEqual(len(masked_positions), 2)
-    self.assertEqual(len(masked_labels), 2)
+    self.assertLen(masked_positions, 2)
+    self.assertLen(masked_labels, 2)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
     # ensure that we took an entire word.
     self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])
@@ -119,8 +119,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             rng=rng,
             do_whole_word_mask=True,
             max_ngram_size=3))
-    self.assertEqual(len(masked_positions), 76)
-    self.assertEqual(len(masked_labels), 76)
+    self.assertLen(masked_positions, 76)
+    self.assertLen(masked_labels, 76)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)


official/vision/utils/object_detection/visualization_utils.py
@@ -969,6 +969,11 @@ def visualize_segmentation_outputs(
     )
     return tf.cast(images, dtype=tf.uint8)

+  if images.shape[3] > 3:
+    images = images[:, :, :, 0:3]
+  elif images.shape[3] == 1:
+    images = tf.image.grayscale_to_rgb(images)
+
   images = tf.nest.map_structure(
       tf.identity,
       tf.map_fn(
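Note: the channel handling now runs before the tf.map_fn call, so the per-image visualization function already receives 3-channel input. A standalone sketch of that normalization pattern (not the package's full visualization path):

# Standalone sketch of the channel normalization this hunk relocates.
import tensorflow as tf


def normalize_channels(images):
  """Keeps the first three channels, or expands grayscale to RGB."""
  if images.shape[3] > 3:
    images = images[:, :, :, 0:3]
  elif images.shape[3] == 1:
    images = tf.image.grayscale_to_rgb(images)
  return images


print(normalize_channels(tf.zeros([2, 8, 8, 1])).shape)  # (2, 8, 8, 3)
print(normalize_channels(tf.zeros([2, 8, 8, 5])).shape)  # (2, 8, 8, 3)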
@@ -981,10 +986,6 @@ def visualize_segmentation_outputs(
       ),
   )

-  if images.shape[3] > 3:
-    images = images[:, :, :, 0:3]
-  elif images.shape[3] == 1:
-    images = tf.image.grayscale_to_rgb(images)
   if true_image_shape is None:
     true_shapes = tf.constant(-1, shape=[images.shape.as_list()[0], 3])
   else:
tf_models_nightly-2.15.0.dev20240104.dist-info/METADATA → tf_models_nightly-2.15.0.dev20240105.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tf-models-nightly
-Version: 2.15.0.dev20240104
+Version: 2.15.0.dev20240105
 Summary: TensorFlow Official Models
 Home-page: https://github.com/tensorflow/models
 Author: Google Inc.
tf_models_nightly-2.15.0.dev20240104.dist-info/RECORD → tf_models_nightly-2.15.0.dev20240105.dist-info/RECORD
@@ -266,8 +266,8 @@ official/nlp/data/__init__.py,sha256=1ToRMjre4mErL4Ek4_dMVxMjXNPossNXggV8fqbISao
 official/nlp/data/classifier_data_lib.py,sha256=fu3JV9LDn8WC2aU8vjvanMQP09rJa11PrF0IZKxsvhA,57081
 official/nlp/data/classifier_data_lib_test.py,sha256=VQ9cp3PMylTnIUB2L4gMG-93pIKwNTriQXuZJ1LmAZo,3362
 official/nlp/data/create_finetuning_data.py,sha256=7uGTMfLDVAFp7cAQqvAYE0M38WluHmRrRlF4PmSnr4c,16703
-official/nlp/data/create_pretraining_data.py,sha256=4QpbuTLt846NMSI5wWEdlXi9YfS6rpzUCUxv806ThAs,23370
-official/nlp/data/create_pretraining_data_test.py,sha256=U5LGakLLA84T8aElLw3dVSIJHKQqOIAT43fayrFZyCY,4867
+official/nlp/data/create_pretraining_data.py,sha256=vwTPpsSQ8wsV8GLCvm_Rw6FrV-R7iy4VP2-HZYtVEvA,24310
+official/nlp/data/create_pretraining_data_test.py,sha256=0PHyjbfZW8WGM_YW2pFdz5phRrS-didEsz6raiN2UEQ,4817
 official/nlp/data/create_xlnet_pretraining_data.py,sha256=WT5AnPU3VV_5HygK7IPaMidjCQF4OnK2dwBbcYxZxwU,24199
 official/nlp/data/create_xlnet_pretraining_data_test.py,sha256=dZ5za7m_lEs1G0oVR_RQwp6GvQGFdnYEjo3i7oMdN_A,10940
 official/nlp/data/data_loader.py,sha256=Rgf5A4jS42dnfXcsdztizBF2kSFnH7gAygLYh49dE38,1698
@@ -1124,7 +1124,7 @@ official/vision/utils/object_detection/preprocessor.py,sha256=w1OnfVQ-pQ02sYLgCY
 official/vision/utils/object_detection/region_similarity_calculator.py,sha256=OzatyMzdwTrdgWXJ2XeIOZazW120K4kIJy4O5glJZxA,4554
 official/vision/utils/object_detection/shape_utils.py,sha256=2rghJjGoDghjqtf2qcJ26oyKO2OmaSMjlZPwB5fJ7EM,3618
 official/vision/utils/object_detection/target_assigner.py,sha256=7lg2C4rH8qZua5mPv02F-GX0V59Ub-nH7UsKQ4_vLb8,24208
-official/vision/utils/object_detection/visualization_utils.py,sha256=9NSBmQS-LS3VSnfWBIYOGGgfOINAndz3-Fo1It_8zqs,40266
+official/vision/utils/object_detection/visualization_utils.py,sha256=iSgKiBVWEYc_uIrC3aW_UOa9WZwQbBy0fm4k95Gf6iU,40267
 orbit/__init__.py,sha256=aQRo8zqIQ0Dw4JQReZeiB6MmuJLvvw4DbYHYti5AGys,1117
 orbit/controller.py,sha256=iOpz5DP-pSisTjUxCsMvYP_Q2YsKwfsSvdqmLnOrJfw,25368
 orbit/controller_test.py,sha256=FABwjwpeCKbW-FhmqztfWv8NuBUjr5uG7mrmqEjf6DY,31802
@@ -1158,9 +1158,9 @@ tensorflow_models/__init__.py,sha256=Ciz_YBke6teb6y42QyQTUBDdXJAiV7Qdu1zOoZvYiKw
 tensorflow_models/tensorflow_models_test.py,sha256=qUBLFZg7rmKkVQ3cHJVlkoid8cPqjjyc2ZiWtjQO5_o,1395
 tensorflow_models/nlp/__init__.py,sha256=3dULDpUBpDi9vljpXadq6oJrWH4y6z42Bz2d3hopYZw,807
 tensorflow_models/vision/__init__.py,sha256=4y77XkHaH8qLls3-6ta4tMp3Xj8CLbB0ihH91HsQ9z4,833
-tf_models_nightly-2.15.0.dev20240104.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
-tf_models_nightly-2.15.0.dev20240104.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
-tf_models_nightly-2.15.0.dev20240104.dist-info/METADATA,sha256=_s-HB_DJHmW78cfmEM8op-EEem4sGqZDXo3QTmpu-3o,1414
-tf_models_nightly-2.15.0.dev20240104.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
-tf_models_nightly-2.15.0.dev20240104.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
-tf_models_nightly-2.15.0.dev20240104.dist-info/RECORD,,
+tf_models_nightly-2.15.0.dev20240105.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.15.0.dev20240105.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.15.0.dev20240105.dist-info/METADATA,sha256=8P_GKo7TGlr-fTDjka7nIiXLJf7yA-7HoN7XLRxAlr4,1414
+tf_models_nightly-2.15.0.dev20240105.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.15.0.dev20240105.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.15.0.dev20240105.dist-info/RECORD,,