SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/arabert/optimization.py (deleted, matching entry 15 above)
@@ -1,202 +0,0 @@
- # coding=utf-8
- # Copyright 2018 The Google AI Language Team Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Functions and classes related to optimization (weight updates)."""
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import re
- import tensorflow as tf
- import lamb_optimizer
-
- def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
-                      optimizer="adamw", poly_power=1.0, start_warmup_step=0,
-                      colocate_gradients_with_ops=False):
-     """Creates an optimizer training op."""
-     global_step = tf.train.get_or_create_global_step()
-
-     learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
-
-     # Implements linear decay of the learning rate.
-     learning_rate = tf.train.polynomial_decay(
-         learning_rate,
-         global_step,
-         num_train_steps,
-         end_learning_rate=0.0,
-         power=poly_power,
-         cycle=False,
-     )
-
-     # Implements linear warmup. I.e., if global_step - start_warmup_step <
-     # num_warmup_steps, the learning rate will be
-     # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
-     if num_warmup_steps:
-         tf.logging.info("++++++ warmup starts at step " + str(start_warmup_step)
-                         + ", for " + str(num_warmup_steps) + " steps ++++++")
-         global_steps_int = tf.cast(global_step, tf.int32)
-         start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
-         global_steps_int = global_steps_int - start_warm_int
-         warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
-
-         global_steps_float = tf.cast(global_steps_int, tf.float32)
-         warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
-
-         warmup_percent_done = global_steps_float / warmup_steps_float
-         warmup_learning_rate = init_lr * warmup_percent_done
-
-         is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
-         learning_rate = (
-             1.0 - is_warmup
-         ) * learning_rate + is_warmup * warmup_learning_rate
-
-     # It is OK that you use this optimizer for finetuning, since this
-     # is how the model was trained (note that the Adam m/v variables are NOT
-     # loaded from init_checkpoint.)
-     # It is OK to use AdamW in the finetuning even the model is trained by LAMB.
-     # As report in the Bert pulic github, the learning rate for SQuAD 1.1 finetune
-     # is 3e-5, 4e-5 or 5e-5. For LAMB, the users can use 3e-4, 4e-4,or 5e-4 for a
-     # batch size of 64 in the finetune.
-     if optimizer == "adamw":
-         tf.logging.info("using adamw")
-         optimizer = AdamWeightDecayOptimizer(
-             learning_rate=learning_rate,
-             weight_decay_rate=0.01,
-             beta_1=0.9,
-             beta_2=0.999,
-             epsilon=1e-6,
-             exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
-     elif optimizer == "lamb":
-         tf.logging.info("using lamb")
-         optimizer = lamb_optimizer.LAMBOptimizer(
-             learning_rate=learning_rate,
-             weight_decay_rate=0.01,
-             beta_1=0.9,
-             beta_2=0.999,
-             epsilon=1e-6,
-             exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
-     else:
-         raise ValueError("Not supported optimizer: ", optimizer)
-
-     if use_tpu:
-         optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
-
-     tvars = tf.trainable_variables()
-     grads = tf.gradients(loss, tvars)
-
-     # This is how the model was pre-trained.
-     (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
-
-     train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
-
-     # Normally the global step update is done inside of `apply_gradients`.
-     # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` do this.
-     # But if you use a different optimizer, you should probably take this line
-     # out.
-     new_global_step = global_step + 1
-     train_op = tf.group(train_op, [global_step.assign(new_global_step)])
-     return train_op
-
-
- class AdamWeightDecayOptimizer(tf.train.Optimizer):
-     """A basic Adam optimizer that includes "correct" L2 weight decay."""
-
-     def __init__(
-         self,
-         learning_rate,
-         weight_decay_rate=0.0,
-         beta_1=0.9,
-         beta_2=0.999,
-         epsilon=1e-6,
-         exclude_from_weight_decay=None,
-         name="AdamWeightDecayOptimizer",
-     ):
-         """Constructs a AdamWeightDecayOptimizer."""
-         super(AdamWeightDecayOptimizer, self).__init__(False, name)
-
-         self.learning_rate = learning_rate
-         self.weight_decay_rate = weight_decay_rate
-         self.beta_1 = beta_1
-         self.beta_2 = beta_2
-         self.epsilon = epsilon
-         self.exclude_from_weight_decay = exclude_from_weight_decay
-
-     def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-         """See base class."""
-         assignments = []
-         for (grad, param) in grads_and_vars:
-             if grad is None or param is None:
-                 continue
-
-             param_name = self._get_variable_name(param.name)
-
-             m = tf.get_variable(
-                 name=param_name + "/adam_m",
-                 shape=param.shape.as_list(),
-                 dtype=tf.float32,
-                 trainable=False,
-                 initializer=tf.zeros_initializer(),
-             )
-             v = tf.get_variable(
-                 name=param_name + "/adam_v",
-                 shape=param.shape.as_list(),
-                 dtype=tf.float32,
-                 trainable=False,
-                 initializer=tf.zeros_initializer(),
-             )
-
-             # Standard Adam update.
-             next_m = tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)
-             next_v = tf.multiply(self.beta_2, v) + tf.multiply(
-                 1.0 - self.beta_2, tf.square(grad)
-             )
-
-             update = next_m / (tf.sqrt(next_v) + self.epsilon)
-
-             # Just adding the square of the weights to the loss function is *not*
-             # the correct way of using L2 regularization/weight decay with Adam,
-             # since that will interact with the m and v parameters in strange ways.
-             #
-             # Instead we want ot decay the weights in a manner that doesn't interact
-             # with the m/v parameters. This is equivalent to adding the square
-             # of the weights to the loss with plain (non-momentum) SGD.
-             if self._do_use_weight_decay(param_name):
-                 update += self.weight_decay_rate * param
-
-             update_with_lr = self.learning_rate * update
-
-             next_param = param - update_with_lr
-
-             assignments.extend(
-                 [param.assign(next_param), m.assign(next_m), v.assign(next_v)]
-             )
-         return tf.group(*assignments, name=name)
-
-     def _do_use_weight_decay(self, param_name):
-         """Whether to use L2 weight decay for `param_name`."""
-         if not self.weight_decay_rate:
-             return False
-         if self.exclude_from_weight_decay:
-             for r in self.exclude_from_weight_decay:
-                 if re.search(r, param_name) is not None:
-                     return False
-         return True
-
-     def _get_variable_name(self, param_name):
-         """Get the variable name from the tensor name."""
-         m = re.match("^(.*):\\d+$", param_name)
-         if m is not None:
-             param_name = m.group(1)
-         return param_name
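
For readers skimming the deleted file above: create_optimizer builds a schedule that decays the learning rate polynomially from init_lr to zero over num_train_steps, and during the first num_warmup_steps after start_warmup_step it instead ramps the rate linearly as (global_step - start_warmup_step) / num_warmup_steps * init_lr. The sketch below is a minimal, framework-free restatement of that schedule for illustration only; the function name lr_at_step and the pure-Python form are assumptions of this note, not part of SinaTools or arabert.

    def lr_at_step(step, init_lr, num_train_steps, num_warmup_steps=0,
                   poly_power=1.0, start_warmup_step=0):
        """Learning rate at a global step: linear warmup, then polynomial decay
        to zero (mirrors the schedule in the deleted create_optimizer)."""
        # Linear warmup: ramp from 0 to init_lr over num_warmup_steps,
        # counted from start_warmup_step.
        if num_warmup_steps and (step - start_warmup_step) < num_warmup_steps:
            return init_lr * (step - start_warmup_step) / num_warmup_steps
        # Polynomial decay, as in tf.train.polynomial_decay with
        # end_learning_rate=0.0 and cycle=False.
        capped = min(step, num_train_steps)
        return init_lr * (1.0 - capped / num_train_steps) ** poly_power

For example, with init_lr=1e-4, num_train_steps=10000 and num_warmup_steps=1000, step 500 gives 5e-5 (halfway through warmup) and step 10000 gives 0.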
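
The comments in the deleted AdamWeightDecayOptimizer also explain why the weight decay is not added to the loss as an L2 term: that would flow through the Adam m/v moments, so the decay is instead applied directly to the parameter after the Adam step (decoupled weight decay in the AdamW style) and skipped for LayerNorm and bias parameters. Below is a minimal NumPy sketch of one such update under the same hyperparameters; adamw_update and apply_decay are illustrative names, not package API.

    import numpy as np

    def adamw_update(param, grad, m, v, lr, weight_decay=0.01,
                     beta1=0.9, beta2=0.999, eps=1e-6, apply_decay=True):
        """One decoupled-weight-decay Adam step, in the spirit of the deleted
        AdamWeightDecayOptimizer.apply_gradients."""
        m = beta1 * m + (1.0 - beta1) * grad             # first moment estimate
        v = beta2 * v + (1.0 - beta2) * np.square(grad)  # second moment estimate
        update = m / (np.sqrt(v) + eps)
        if apply_decay:  # the real code skips LayerNorm/bias params via regex
            update = update + weight_decay * param
        return param - lr * update, m, v

Note that, as in the original BERT code, m and v are not bias-corrected; that is a property of the deleted optimizer itself, not an omission in the sketch.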