tf-models-nightly 2.15.0.dev20240104__py2.py3-none-any.whl → 2.15.0.dev20240105__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- official/nlp/data/create_pretraining_data.py +77 -27
- official/nlp/data/create_pretraining_data_test.py +8 -8
- official/vision/utils/object_detection/visualization_utils.py +5 -4
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/METADATA +1 -1
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/RECORD +9 -9
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/AUTHORS +0 -0
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/LICENSE +0 -0
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/WHEEL +0 -0
- {tf_models_nightly-2.15.0.dev20240104.dist-info → tf_models_nightly-2.15.0.dev20240105.dist-info}/top_level.txt +0 -0
official/nlp/data/create_pretraining_data.py

@@ -19,6 +19,7 @@ import itertools
 import random

 # Import libraries
+
 from absl import app
 from absl import flags
 from absl import logging
@@ -35,8 +36,26 @@ flags.DEFINE_string(
     "output_file", None,
     "Output TF example file (or comma-separated list of files).")

-flags.
-
+flags.DEFINE_enum(
+    "tokenization",
+    "WordPiece",
+    ["WordPiece", "SentencePiece"],
+    "Specifies the tokenizer implementation, i.e., whether to use WordPiece "
+    "or SentencePiece tokenizer. Canonical BERT uses WordPiece tokenizer, "
+    "while ALBERT uses SentencePiece tokenizer.",
+)
+
+flags.DEFINE_string(
+    "vocab_file",
+    None,
+    "For WordPiece tokenization, the vocabulary file of the tokenizer.",
+)
+
+flags.DEFINE_string(
+    "sp_model_file",
+    "",
+    "For SentencePiece tokenization, the path to the model of the tokenizer.",
+)

 flags.DEFINE_bool(
     "do_lower_case", True,
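The new `tokenization` flag is declared as an absl enum, so unsupported values are rejected at flag-parsing time rather than deep inside the pipeline. A minimal, standalone sketch of that behaviour (illustrative only; the flag name and values mirror the hunk above, the rest is not package code):

```python
# Standalone sketch: absl enum flags reject values outside the allowed list.
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_enum(
    "tokenization", "WordPiece", ["WordPiece", "SentencePiece"],
    "Tokenizer implementation to use.")

FLAGS(["demo", "--tokenization=SentencePiece"])  # parses cleanly
print(FLAGS.tokenization)  # -> SentencePiece

# FLAGS(["demo", "--tokenization=BPE"]) would raise
# flags.IllegalFlagValueError, since "BPE" is not one of the enum values.
```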
@@ -44,8 +63,10 @@ flags.DEFINE_bool(
     "models and False for cased models.")

 flags.DEFINE_bool(
-    "do_whole_word_mask",
-
+    "do_whole_word_mask",
+    False,
+    "Whether to use whole word masking rather than per-token masking.",
+)

 flags.DEFINE_integer(
     "max_ngram_size", None,
@@ -198,16 +219,19 @@ def create_float_feature(values):
   return feature


-def create_training_instances(
-
-
-
-
-
-
-
-
-
+def create_training_instances(
+    input_files,
+    tokenizer,
+    processor_text_fn,
+    max_seq_length,
+    dupe_factor,
+    short_seq_prob,
+    masked_lm_prob,
+    max_predictions_per_seq,
+    rng,
+    do_whole_word_mask=False,
+    max_ngram_size=None,
+):
   """Create `TrainingInstance`s from raw text."""
   all_documents = [[]]

@@ -219,11 +243,8 @@ def create_training_instances(input_files,
   # that the "next sentence prediction" task doesn't span between documents.
   for input_file in input_files:
     with tf.io.gfile.GFile(input_file, "rb") as reader:
-
-        line =
-        if not line:
-          break
-        line = line.strip()
+      for line in reader:
+        line = processor_text_fn(line)

         # Empty lines are used as document delimiters
         if not line:
@@ -535,7 +556,7 @@ def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng):
   return output_ngrams


-def
+def _tokens_to_grams(tokens):
   """Reconstitue grams (words) from `tokens`.

   E.g.,
@@ -543,7 +564,8 @@ def _wordpieces_to_grams(tokens):
     grams: [ [1,2), [2, 4), [4,5) , [5, 6)]

   Args:
-    tokens: list of
+    tokens: list of tokens (word pieces or sentence pieces).
+
   Returns:
     List of _Grams representing spans of whole words
     (without "[CLS]" and "[SEP]").
@@ -570,7 +592,7 @@ def create_masked_lm_predictions(tokens, masked_lm_prob,
                                  max_ngram_size=None):
   """Creates the predictions for the masked LM objective."""
   if do_whole_word_mask:
-    grams =
+    grams = _tokens_to_grams(tokens)
   else:
     # Here we consider each token to be a word to allow for sub-word masking.
     if max_ngram_size:
@@ -633,9 +655,28 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
       trunc_tokens.pop()


+def get_processor_text_fn(is_sentence_piece, do_lower_case):
+  def processor_text_fn(text):
+    text = tokenization.convert_to_unicode(text)
+    if is_sentence_piece:
+      # Additional preprocessing specific to the SentencePiece tokenizer.
+      text = tokenization.preprocess_text(text, lower=do_lower_case)
+
+    return text.strip()
+
+  return processor_text_fn
+
+
 def main(_):
-
-
+  if FLAGS.tokenization == "WordPiece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
+    )
+    processor_text_fn = get_processor_text_fn(False, FLAGS.do_lower_case)
+  else:
+    assert FLAGS.tokenization == "SentencePiece"
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = get_processor_text_fn(True, FLAGS.do_lower_case)

   input_files = []
   for input_pattern in FLAGS.input_file.split(","):
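`main` now selects the tokenizer from `FLAGS.tokenization` and builds a per-line `processor_text_fn` helper instead of the old readline loop. A rough, self-contained stand-in for what that helper does, assuming UTF-8 input; plain Python replaces `tokenization.convert_to_unicode` and `tokenization.preprocess_text`, and lowercasing is applied only on the SentencePiece path, as in the hunk above:

```python
# Illustrative stand-in, not the library helper: bytes read from a GFile opened
# in "rb" are decoded, optionally lowercased for SentencePiece, and stripped.
def make_processor_text_fn(is_sentence_piece, do_lower_case):
  def processor_text_fn(text):
    if isinstance(text, bytes):
      text = text.decode("utf-8", "ignore")
    if is_sentence_piece and do_lower_case:
      text = text.lower()  # the real preprocess_text also normalizes the text
    return text.strip()
  return processor_text_fn

process = make_processor_text_fn(is_sentence_piece=True, do_lower_case=True)
print(process(b"Hello, ALBERT!\n"))  # -> "hello, albert!"
```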
@@ -647,9 +688,18 @@ def main(_):

   rng = random.Random(FLAGS.random_seed)
   instances = create_training_instances(
-      input_files,
-
-
+      input_files,
+      tokenizer,
+      processor_text_fn,
+      FLAGS.max_seq_length,
+      FLAGS.dupe_factor,
+      FLAGS.short_seq_prob,
+      FLAGS.masked_lm_prob,
+      FLAGS.max_predictions_per_seq,
+      rng,
+      FLAGS.do_whole_word_mask,
+      FLAGS.max_ngram_size,
+  )

   output_files = FLAGS.output_file.split(",")
   logging.info("*** Writing to output files ***")
official/nlp/data/create_pretraining_data_test.py

@@ -43,7 +43,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
          continue
       self.fail("invalid mask value: {}".format(output_token))

-  def
+  def test_tokens_to_grams(self):
     tests = [
         (["That", "cone"], [(0, 1), (1, 2)]),
        (["That", "cone", "##s"], [(0, 1), (1, 3)]),
@@ -52,7 +52,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
         (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
     ]
     for inp, expected in tests:
-      output = cpd.
+      output = cpd._tokens_to_grams(inp)
       self.assertEqual(expected, output)

   def test_window(self):
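`_tokens_to_grams` (formerly `_wordpieces_to_grams`) groups sub-word pieces back into whole-word spans. A hypothetical, self-contained re-implementation of that grouping for WordPiece-style tokens (the function name and details below are illustrative, not the library code); it reproduces the `(begin, end)` pairs used in the tests above:

```python
# Hypothetical sketch: group WordPiece tokens into [start, end) whole-word
# spans, skipping "[CLS]"/"[SEP]" and merging "##"-prefixed continuations.
def group_wordpieces(tokens):
  spans = []
  start = None
  for i, token in enumerate(tokens):
    if token in ("[CLS]", "[SEP]"):
      if start is not None:
        spans.append((start, i))
        start = None
    elif token.startswith("##"):
      continue  # continuation piece: extends the current word
    else:
      if start is not None:
        spans.append((start, i))
      start = i
  if start is not None:
    spans.append((start, len(tokens)))
  return spans

assert group_wordpieces(["That", "cone"]) == [(0, 1), (1, 2)]
assert group_wordpieces(["That", "cone", "##s"]) == [(0, 1), (1, 3)]
assert group_wordpieces(["[CLS]", "Up", "##dog", "[SEP]", "Down"]) == [(1, 3), (4, 5)]
```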
@@ -81,8 +81,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             rng=rng,
             do_whole_word_mask=False,
             max_ngram_size=None))
-    self.
-    self.
+    self.assertLen(masked_positions, 3)
+    self.assertLen(masked_labels, 3)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)

   def test_create_masked_lm_predictions_whole_word(self):
@@ -100,8 +100,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             max_ngram_size=None))
     # since we can't get exactly three tokens without breaking a word we
     # only take two.
-    self.
-    self.
+    self.assertLen(masked_positions, 2)
+    self.assertLen(masked_labels, 2)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
     # ensure that we took an entire word.
     self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])
@@ -119,8 +119,8 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             rng=rng,
             do_whole_word_mask=True,
             max_ngram_size=3))
-    self.
-    self.
+    self.assertLen(masked_positions, 76)
+    self.assertLen(masked_labels, 76)
     self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)

official/vision/utils/object_detection/visualization_utils.py

@@ -969,6 +969,11 @@ def visualize_segmentation_outputs(
     )
     return tf.cast(images, dtype=tf.uint8)

+  if images.shape[3] > 3:
+    images = images[:, :, :, 0:3]
+  elif images.shape[3] == 1:
+    images = tf.image.grayscale_to_rgb(images)
+
   images = tf.nest.map_structure(
       tf.identity,
       tf.map_fn(
@@ -981,10 +986,6 @@ def visualize_segmentation_outputs(
       ),
   )

-  if images.shape[3] > 3:
-    images = images[:, :, :, 0:3]
-  elif images.shape[3] == 1:
-    images = tf.image.grayscale_to_rgb(images)
   if true_image_shape is None:
     true_shapes = tf.constant(-1, shape=[images.shape.as_list()[0], 3])
   else:
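These two hunks move the channel handling ahead of the per-image drawing in `tf.map_fn`, so every image is already three-channel before it is drawn. A minimal sketch of that normalization, assuming NHWC batches (`normalize_channels` is an illustrative name, not the library function):

```python
# Sketch: drop extra channels and expand single-channel images to RGB,
# mirroring the block that now runs before the per-image drawing.
import tensorflow as tf

def normalize_channels(images):
  if images.shape[3] > 3:
    images = images[:, :, :, 0:3]                 # keep the first three channels
  elif images.shape[3] == 1:
    images = tf.image.grayscale_to_rgb(images)    # replicate to three channels
  return images

rgbd = tf.zeros([2, 8, 8, 4])   # e.g., RGB plus an extra channel
gray = tf.zeros([2, 8, 8, 1])
print(normalize_channels(rgbd).shape)  # (2, 8, 8, 3)
print(normalize_channels(gray).shape)  # (2, 8, 8, 3)
```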
tf_models_nightly-2.15.0.dev20240105.dist-info/RECORD

@@ -266,8 +266,8 @@ official/nlp/data/__init__.py,sha256=1ToRMjre4mErL4Ek4_dMVxMjXNPossNXggV8fqbISao
 official/nlp/data/classifier_data_lib.py,sha256=fu3JV9LDn8WC2aU8vjvanMQP09rJa11PrF0IZKxsvhA,57081
 official/nlp/data/classifier_data_lib_test.py,sha256=VQ9cp3PMylTnIUB2L4gMG-93pIKwNTriQXuZJ1LmAZo,3362
 official/nlp/data/create_finetuning_data.py,sha256=7uGTMfLDVAFp7cAQqvAYE0M38WluHmRrRlF4PmSnr4c,16703
-official/nlp/data/create_pretraining_data.py,sha256=
-official/nlp/data/create_pretraining_data_test.py,sha256=
+official/nlp/data/create_pretraining_data.py,sha256=vwTPpsSQ8wsV8GLCvm_Rw6FrV-R7iy4VP2-HZYtVEvA,24310
+official/nlp/data/create_pretraining_data_test.py,sha256=0PHyjbfZW8WGM_YW2pFdz5phRrS-didEsz6raiN2UEQ,4817
 official/nlp/data/create_xlnet_pretraining_data.py,sha256=WT5AnPU3VV_5HygK7IPaMidjCQF4OnK2dwBbcYxZxwU,24199
 official/nlp/data/create_xlnet_pretraining_data_test.py,sha256=dZ5za7m_lEs1G0oVR_RQwp6GvQGFdnYEjo3i7oMdN_A,10940
 official/nlp/data/data_loader.py,sha256=Rgf5A4jS42dnfXcsdztizBF2kSFnH7gAygLYh49dE38,1698
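The RECORD entries change because both modified files have new contents and sizes. Per the wheel spec (PEP 376/427), each line is `path,sha256=<urlsafe unpadded base64 of the SHA-256 digest>,<size in bytes>`; a small illustrative sketch of how such an entry is derived (`record_entry` is not package code):

```python
# Sketch: build a wheel RECORD line for a file's bytes.
import base64
import hashlib

def record_entry(path, data: bytes) -> str:
  digest = hashlib.sha256(data).digest()
  b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
  return f"{path},sha256={b64},{len(data)}"

print(record_entry("official/nlp/data/create_pretraining_data.py", b"example bytes"))
```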
@@ -1124,7 +1124,7 @@ official/vision/utils/object_detection/preprocessor.py,sha256=w1OnfVQ-pQ02sYLgCY
 official/vision/utils/object_detection/region_similarity_calculator.py,sha256=OzatyMzdwTrdgWXJ2XeIOZazW120K4kIJy4O5glJZxA,4554
 official/vision/utils/object_detection/shape_utils.py,sha256=2rghJjGoDghjqtf2qcJ26oyKO2OmaSMjlZPwB5fJ7EM,3618
 official/vision/utils/object_detection/target_assigner.py,sha256=7lg2C4rH8qZua5mPv02F-GX0V59Ub-nH7UsKQ4_vLb8,24208
-official/vision/utils/object_detection/visualization_utils.py,sha256=
+official/vision/utils/object_detection/visualization_utils.py,sha256=iSgKiBVWEYc_uIrC3aW_UOa9WZwQbBy0fm4k95Gf6iU,40267
 orbit/__init__.py,sha256=aQRo8zqIQ0Dw4JQReZeiB6MmuJLvvw4DbYHYti5AGys,1117
 orbit/controller.py,sha256=iOpz5DP-pSisTjUxCsMvYP_Q2YsKwfsSvdqmLnOrJfw,25368
 orbit/controller_test.py,sha256=FABwjwpeCKbW-FhmqztfWv8NuBUjr5uG7mrmqEjf6DY,31802
@@ -1158,9 +1158,9 @@ tensorflow_models/__init__.py,sha256=Ciz_YBke6teb6y42QyQTUBDdXJAiV7Qdu1zOoZvYiKw
 tensorflow_models/tensorflow_models_test.py,sha256=qUBLFZg7rmKkVQ3cHJVlkoid8cPqjjyc2ZiWtjQO5_o,1395
 tensorflow_models/nlp/__init__.py,sha256=3dULDpUBpDi9vljpXadq6oJrWH4y6z42Bz2d3hopYZw,807
 tensorflow_models/vision/__init__.py,sha256=4y77XkHaH8qLls3-6ta4tMp3Xj8CLbB0ihH91HsQ9z4,833
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
-tf_models_nightly-2.15.0.
+tf_models_nightly-2.15.0.dev20240105.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.15.0.dev20240105.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.15.0.dev20240105.dist-info/METADATA,sha256=8P_GKo7TGlr-fTDjka7nIiXLJf7yA-7HoN7XLRxAlr4,1414
+tf_models_nightly-2.15.0.dev20240105.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.15.0.dev20240105.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.15.0.dev20240105.dist-info/RECORD,,
The remaining dist-info files (AUTHORS, LICENSE, WHEEL, top_level.txt) are unchanged.