SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
--- sinatools/arabert/araelectra/pretrain/pretrain_data.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Helpers for preparing pre-training data and supplying them to the model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-import numpy as np
-import tensorflow as tf
-
-import configure_pretraining
-from model import tokenization
-from util import utils
-
-
-def get_input_fn(config: configure_pretraining.PretrainingConfig, is_training,
-                 num_cpu_threads=4):
-  """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-  input_files = []
-  for input_pattern in config.pretrain_tfrecords.split(","):
-    input_files.extend(tf.io.gfile.glob(input_pattern))
-
-  def input_fn(params):
-    """The actual input function."""
-    batch_size = params["batch_size"]
-
-    name_to_features = {
-        "input_ids": tf.io.FixedLenFeature([config.max_seq_length], tf.int64),
-        "input_mask": tf.io.FixedLenFeature([config.max_seq_length], tf.int64),
-        "segment_ids": tf.io.FixedLenFeature([config.max_seq_length], tf.int64),
-    }
-
-    d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
-    d = d.repeat()
-    d = d.shuffle(buffer_size=len(input_files))
-
-    # `cycle_length` is the number of parallel files that get read.
-    cycle_length = min(num_cpu_threads, len(input_files))
-
-    # `sloppy` mode means that the interleaving is not exact. This adds
-    # even more randomness to the training pipeline.
-    d = d.apply(
-        tf.data.experimental.parallel_interleave(
-            tf.data.TFRecordDataset,
-            sloppy=is_training,
-            cycle_length=cycle_length))
-    d = d.shuffle(buffer_size=100)
-
-    # We must `drop_remainder` on training because the TPU requires fixed
-    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
-    # and we *don"t* want to drop the remainder, otherwise we wont cover
-    # every sample.
-    d = d.apply(
-        tf.data.experimental.map_and_batch(
-            lambda record: _decode_record(record, name_to_features),
-            batch_size=batch_size,
-            num_parallel_batches=num_cpu_threads,
-            drop_remainder=True))
-    return d
-
-  return input_fn
-
-
-def _decode_record(record, name_to_features):
-  """Decodes a record to a TensorFlow example."""
-  example = tf.io.parse_single_example(record, name_to_features)
-
-  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-  # So cast all int64 to int32.
-  for name in list(example.keys()):
-    t = example[name]
-    if t.dtype == tf.int64:
-      t = tf.cast(t, tf.int32)
-    example[name] = t
-
-  return example
-
-
-# model inputs - it's a bit nicer to use a namedtuple rather than keep the
-# features as a dict
-Inputs = collections.namedtuple(
-    "Inputs", ["input_ids", "input_mask", "segment_ids", "masked_lm_positions",
-               "masked_lm_ids", "masked_lm_weights"])
-
-
-def features_to_inputs(features):
-  return Inputs(
-      input_ids=features["input_ids"],
-      input_mask=features["input_mask"],
-      segment_ids=features["segment_ids"],
-      masked_lm_positions=(features["masked_lm_positions"]
-                           if "masked_lm_positions" in features else None),
-      masked_lm_ids=(features["masked_lm_ids"]
-                     if "masked_lm_ids" in features else None),
-      masked_lm_weights=(features["masked_lm_weights"]
-                         if "masked_lm_weights" in features else None),
-  )
-
-
-def get_updated_inputs(inputs, **kwargs):
-  features = inputs._asdict()
-  for k, v in kwargs.items():
-    features[k] = v
-  return features_to_inputs(features)
-
-
-ENDC = "\033[0m"
-COLORS = ["\033[" + str(n) + "m" for n in list(range(91, 97)) + [90]]
-RED = COLORS[0]
-BLUE = COLORS[3]
-CYAN = COLORS[5]
-GREEN = COLORS[1]
-
-
-def print_tokens(inputs: Inputs, inv_vocab, updates_mask=None):
-  """Pretty-print model inputs."""
-  pos_to_tokid = {}
-  for tokid, pos, weight in zip(
-      inputs.masked_lm_ids[0], inputs.masked_lm_positions[0],
-      inputs.masked_lm_weights[0]):
-    if weight == 0:
-      pass
-    else:
-      pos_to_tokid[pos] = tokid
-
-  text = ""
-  provided_update_mask = (updates_mask is not None)
-  if not provided_update_mask:
-    updates_mask = np.zeros_like(inputs.input_ids)
-  for pos, (tokid, um) in enumerate(
-      zip(inputs.input_ids[0], updates_mask[0])):
-    token = inv_vocab[tokid]
-    if token == "[PAD]":
-      break
-    if pos in pos_to_tokid:
-      token = RED + token + " (" + inv_vocab[pos_to_tokid[pos]] + ")" + ENDC
-      if provided_update_mask:
-        assert um == 1
-    else:
-      if provided_update_mask:
-        assert um == 0
-    text += token + " "
-  utils.log(tokenization.printable_text(text))
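For reference, the removed pretrain_data.py wraps batched features in an immutable namedtuple rather than passing dicts around, and "updates" a batch by rebuilding the tuple. A minimal, dependency-free sketch of that pattern follows (illustrative only, not code shipped in the package; the toy feature values are invented):

from collections import namedtuple

# Same fields as the Inputs namedtuple defined in the removed module.
Inputs = namedtuple(
    "Inputs", ["input_ids", "input_mask", "segment_ids",
               "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"])

def features_to_inputs(features):
    # Masked-LM fields are optional at this stage, hence the .get() defaults.
    return Inputs(
        input_ids=features["input_ids"],
        input_mask=features["input_mask"],
        segment_ids=features["segment_ids"],
        masked_lm_positions=features.get("masked_lm_positions"),
        masked_lm_ids=features.get("masked_lm_ids"),
        masked_lm_weights=features.get("masked_lm_weights"))

def get_updated_inputs(inputs, **kwargs):
    # _asdict() yields a plain dict, so fields are swapped by rebuilding the tuple.
    features = inputs._asdict()
    features.update(kwargs)
    return features_to_inputs(features)

batch = features_to_inputs({"input_ids": [[5, 6, 7]],
                            "input_mask": [[1, 1, 1]],
                            "segment_ids": [[0, 0, 0]]})
masked = get_updated_inputs(batch, masked_lm_positions=[[1]])
print(masked.masked_lm_positions)  # [[1]]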
--- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Helper functions for pre-training. These mainly deal with the gathering and
-scattering needed so the generator only makes predictions for the small number
-of masked tokens.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-import configure_pretraining
-from model import modeling
-from model import tokenization
-from pretrain import pretrain_data
-
-
-def gather_positions(sequence, positions):
-  """Gathers the vectors at the specific positions over a minibatch.
-
-  Args:
-    sequence: A [batch_size, seq_length] or
-        [batch_size, seq_length, depth] tensor of values
-    positions: A [batch_size, n_positions] tensor of indices
-
-  Returns: A [batch_size, n_positions] or
-    [batch_size, n_positions, depth] tensor of the values at the indices
-  """
-  shape = modeling.get_shape_list(sequence, expected_rank=[2, 3])
-  depth_dimension = (len(shape) == 3)
-  if depth_dimension:
-    B, L, D = shape
-  else:
-    B, L = shape
-    D = 1
-    sequence = tf.expand_dims(sequence, -1)
-  position_shift = tf.expand_dims(L * tf.range(B), -1)
-  flat_positions = tf.reshape(positions + position_shift, [-1])
-  flat_sequence = tf.reshape(sequence, [B * L, D])
-  gathered = tf.gather(flat_sequence, flat_positions)
-  if depth_dimension:
-    return tf.reshape(gathered, [B, -1, D])
-  else:
-    return tf.reshape(gathered, [B, -1])
-
-
-def scatter_update(sequence, updates, positions):
-  """Scatter-update a sequence.
-
-  Args:
-    sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor
-    updates: A tensor of size batch_size*seq_len(*depth)
-    positions: A [batch_size, n_positions] tensor
-
-  Returns: A tuple of two tensors. First is a [batch_size, seq_len] or
-    [batch_size, seq_len, depth] tensor of "sequence" with elements at
-    "positions" replaced by the values at "updates." Updates to index 0 are
-    ignored. If there are duplicated positions the update is only applied once.
-    Second is a [batch_size, seq_len] mask tensor of which inputs were updated.
-  """
-  shape = modeling.get_shape_list(sequence, expected_rank=[2, 3])
-  depth_dimension = (len(shape) == 3)
-  if depth_dimension:
-    B, L, D = shape
-  else:
-    B, L = shape
-    D = 1
-    sequence = tf.expand_dims(sequence, -1)
-  N = modeling.get_shape_list(positions)[1]
-
-  shift = tf.expand_dims(L * tf.range(B), -1)
-  flat_positions = tf.reshape(positions + shift, [-1, 1])
-  flat_updates = tf.reshape(updates, [-1, D])
-  updates = tf.scatter_nd(flat_positions, flat_updates, [B * L, D])
-  updates = tf.reshape(updates, [B, L, D])
-
-  flat_updates_mask = tf.ones([B * N], tf.int32)
-  updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask, [B * L])
-  updates_mask = tf.reshape(updates_mask, [B, L])
-  not_first_token = tf.concat([tf.zeros((B, 1), tf.int32),
-                               tf.ones((B, L - 1), tf.int32)], -1)
-  updates_mask *= not_first_token
-  updates_mask_3d = tf.expand_dims(updates_mask, -1)
-
-  # account for duplicate positions
-  if sequence.dtype == tf.float32:
-    updates_mask_3d = tf.cast(updates_mask_3d, tf.float32)
-    updates /= tf.maximum(1.0, updates_mask_3d)
-  else:
-    assert sequence.dtype == tf.int32
-    updates = tf.math.floordiv(updates, tf.maximum(1, updates_mask_3d))
-  updates_mask = tf.minimum(updates_mask, 1)
-  updates_mask_3d = tf.minimum(updates_mask_3d, 1)
-
-  updated_sequence = (((1 - updates_mask_3d) * sequence) +
-                      (updates_mask_3d * updates))
-  if not depth_dimension:
-    updated_sequence = tf.squeeze(updated_sequence, -1)
-
-  return updated_sequence, updates_mask
-
-
-VOCAB_MAPPING = {}
-
-
-def get_vocab(config: configure_pretraining.PretrainingConfig):
-  """Memoized load of the vocab file."""
-  if config.vocab_file not in VOCAB_MAPPING:
-    vocab = tokenization.FullTokenizer(
-        config.vocab_file, do_lower_case=True).vocab
-    VOCAB_MAPPING[config.vocab_file] = vocab
-  return VOCAB_MAPPING[config.vocab_file]
-
-
-def get_candidates_mask(config: configure_pretraining.PretrainingConfig,
-                        inputs: pretrain_data.Inputs,
-                        disallow_from_mask=None):
-  """Returns a mask tensor of positions in the input that can be masked out."""
-  vocab = get_vocab(config)
-  ignore_ids = [vocab["[SEP]"], vocab["[CLS]"], vocab["[MASK]"]]
-  candidates_mask = tf.ones_like(inputs.input_ids, tf.bool)
-  for ignore_id in ignore_ids:
-    candidates_mask &= tf.not_equal(inputs.input_ids, ignore_id)
-  candidates_mask &= tf.cast(inputs.input_mask, tf.bool)
-  if disallow_from_mask is not None:
-    candidates_mask &= ~disallow_from_mask
-  return candidates_mask
-
-
-def mask(config: configure_pretraining.PretrainingConfig,
-         inputs: pretrain_data.Inputs, mask_prob, proposal_distribution=1.0,
-         disallow_from_mask=None, already_masked=None):
-  """Implementation of dynamic masking. The optional arguments aren't needed for
-  BERT/ELECTRA and are from early experiments in "strategically" masking out
-  tokens instead of uniformly at random.
-
-  Args:
-    config: configure_pretraining.PretrainingConfig
-    inputs: pretrain_data.Inputs containing input input_ids/input_mask
-    mask_prob: percent of tokens to mask
-    proposal_distribution: for non-uniform masking can be a [B, L] tensor
-      of scores for masking each position.
-    disallow_from_mask: a boolean tensor of [B, L] of positions that should
-      not be masked out
-    already_masked: a boolean tensor of [B, N] of already masked-out tokens
-      for multiple rounds of masking
-  Returns: a pretrain_data.Inputs with masking added
-  """
-  # Get the batch size, sequence length, and max masked-out tokens
-  N = config.max_predictions_per_seq
-  B, L = modeling.get_shape_list(inputs.input_ids)
-
-  # Find indices where masking out a token is allowed
-  vocab = get_vocab(config)
-  candidates_mask = get_candidates_mask(config, inputs, disallow_from_mask)
-
-  # Set the number of tokens to mask out per example
-  num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32)
-  num_to_predict = tf.maximum(1, tf.minimum(
-      N, tf.cast(tf.round(num_tokens * mask_prob), tf.int32)))
-  masked_lm_weights = tf.cast(tf.sequence_mask(num_to_predict, N), tf.float32)
-  if already_masked is not None:
-    masked_lm_weights *= (1 - already_masked)
-
-  # Get a probability of masking each position in the sequence
-  candidate_mask_float = tf.cast(candidates_mask, tf.float32)
-  sample_prob = (proposal_distribution * candidate_mask_float)
-  sample_prob /= tf.reduce_sum(sample_prob, axis=-1, keepdims=True)
-
-  # Sample the positions to mask out
-  sample_prob = tf.stop_gradient(sample_prob)
-  sample_logits = tf.log(sample_prob)
-  masked_lm_positions = tf.random.categorical(
-      sample_logits, N, dtype=tf.int32)
-  masked_lm_positions *= tf.cast(masked_lm_weights, tf.int32)
-
-  # Get the ids of the masked-out tokens
-  shift = tf.expand_dims(L * tf.range(B), -1)
-  flat_positions = tf.reshape(masked_lm_positions + shift, [-1, 1])
-  masked_lm_ids = tf.gather_nd(tf.reshape(inputs.input_ids, [-1]),
-                               flat_positions)
-  masked_lm_ids = tf.reshape(masked_lm_ids, [B, -1])
-  masked_lm_ids *= tf.cast(masked_lm_weights, tf.int32)
-
-  # Update the input ids
-  replace_with_mask_positions = masked_lm_positions * tf.cast(
-      tf.less(tf.random.uniform([B, N]), 0.85), tf.int32)
-  inputs_ids, _ = scatter_update(
-      inputs.input_ids, tf.fill([B, N], vocab["[MASK]"]),
-      replace_with_mask_positions)
-
-  return pretrain_data.get_updated_inputs(
-      inputs,
-      input_ids=tf.stop_gradient(inputs_ids),
-      masked_lm_positions=masked_lm_positions,
-      masked_lm_ids=masked_lm_ids,
-      masked_lm_weights=masked_lm_weights
-  )
-
-
-def unmask(inputs: pretrain_data.Inputs):
-  unmasked_input_ids, _ = scatter_update(
-      inputs.input_ids, inputs.masked_lm_ids, inputs.masked_lm_positions)
-  return pretrain_data.get_updated_inputs(inputs, input_ids=unmasked_input_ids)
-
-
-def sample_from_softmax(logits, disallow=None):
-  if disallow is not None:
-    logits -= 1000.0 * disallow
-  uniform_noise = tf.random.uniform(
-      modeling.get_shape_list(logits), minval=0, maxval=1)
-  gumbel_noise = -tf.log(-tf.log(uniform_noise + 1e-9) + 1e-9)
-  return tf.one_hot(tf.argmax(tf.nn.softmax(logits + gumbel_noise), -1,
-                              output_type=tf.int32), logits.shape[-1])
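Both gather_positions and scatter_update in the removed pretrain_helpers.py rely on a flatten-and-shift trick: each example's positions are offset by example_index * seq_length so that a single flat gather or scatter covers the whole minibatch. A rough NumPy analogue of the gather side, as a sketch only (gather_positions_np is a hypothetical name, not part of the package):

import numpy as np

def gather_positions_np(sequence, positions):
    """Gather sequence[b, positions[b, i]] for every example b in the batch."""
    B, L = sequence.shape[0], sequence.shape[1]
    depth = sequence.shape[2] if sequence.ndim == 3 else 1
    shift = (L * np.arange(B))[:, None]           # [B, 1] offset per example
    flat_positions = (positions + shift).reshape(-1)
    flat_sequence = sequence.reshape(B * L, depth)
    gathered = flat_sequence[flat_positions].reshape(B, -1, depth)
    return gathered if sequence.ndim == 3 else gathered.squeeze(-1)

seq = np.array([[10, 11, 12, 13],
                [20, 21, 22, 23]])
pos = np.array([[1, 3],
                [0, 2]])
print(gather_positions_np(seq, pos))  # [[11 13]
                                      #  [20 22]]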