SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/araelectra/model/optimization.py
@@ -1,193 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Functions and classes related to optimization (weight updates).
- Modified from the original BERT code to allow for having separate learning
- rates for different layers of the network.
- """
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import collections
- import re
- import tensorflow as tf
-
-
- def create_optimizer(
-     loss, learning_rate, num_train_steps, weight_decay_rate=0.0, use_tpu=False,
-     warmup_steps=0, warmup_proportion=0, lr_decay_power=1.0,
-     layerwise_lr_decay_power=-1, n_transformer_layers=None):
-   """Creates an optimizer and training op."""
-   global_step = tf.train.get_or_create_global_step()
-   learning_rate = tf.train.polynomial_decay(
-       learning_rate,
-       global_step,
-       num_train_steps,
-       end_learning_rate=0.0,
-       power=lr_decay_power,
-       cycle=False)
-   warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
-   learning_rate *= tf.minimum(
-       1.0, tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32))
-
-   if layerwise_lr_decay_power > 0:
-     learning_rate = _get_layer_lrs(learning_rate, layerwise_lr_decay_power,
-                                    n_transformer_layers)
-   optimizer = AdamWeightDecayOptimizer(
-       learning_rate=learning_rate,
-       weight_decay_rate=weight_decay_rate,
-       beta_1=0.9,
-       beta_2=0.999,
-       epsilon=1e-6,
-       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
-   if use_tpu:
-     optimizer = tf.tpu.CrossShardOptimizer(optimizer)
-
-   tvars = tf.trainable_variables()
-   grads = tf.gradients(loss, tvars)
-   (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
-   train_op = optimizer.apply_gradients(
-       zip(grads, tvars), global_step=global_step)
-   new_global_step = global_step + 1
-   train_op = tf.group(train_op, [global_step.assign(new_global_step)])
-   return train_op
-
-
- class AdamWeightDecayOptimizer(tf.train.Optimizer):
-   """A basic Adam optimizer that includes "correct" L2 weight decay."""
-
-   def __init__(self,
-                learning_rate,
-                weight_decay_rate=0.0,
-                beta_1=0.9,
-                beta_2=0.999,
-                epsilon=1e-6,
-                exclude_from_weight_decay=None,
-                name="AdamWeightDecayOptimizer"):
-     """Constructs an AdamWeightDecayOptimizer."""
-     super(AdamWeightDecayOptimizer, self).__init__(False, name)
-
-     self.learning_rate = learning_rate
-     self.weight_decay_rate = weight_decay_rate
-     self.beta_1 = beta_1
-     self.beta_2 = beta_2
-     self.epsilon = epsilon
-     self.exclude_from_weight_decay = exclude_from_weight_decay
-
-   def _apply_gradients(self, grads_and_vars, learning_rate):
-     """See base class."""
-     assignments = []
-     for (grad, param) in grads_and_vars:
-       if grad is None or param is None:
-         continue
-
-       param_name = self._get_variable_name(param.name)
-
-       m = tf.get_variable(
-           name=param_name + "/adam_m",
-           shape=param.shape.as_list(),
-           dtype=tf.float32,
-           trainable=False,
-           initializer=tf.zeros_initializer())
-       v = tf.get_variable(
-           name=param_name + "/adam_v",
-           shape=param.shape.as_list(),
-           dtype=tf.float32,
-           trainable=False,
-           initializer=tf.zeros_initializer())
-
-       # Standard Adam update.
-       next_m = (
-           tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
-       next_v = (
-           tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
-                                                     tf.square(grad)))
-       update = next_m / (tf.sqrt(next_v) + self.epsilon)
-
-       # Just adding the square of the weights to the loss function is *not*
-       # the correct way of using L2 regularization/weight decay with Adam,
-       # since that will interact with the m and v parameters in strange ways.
-       #
-       # Instead we want to decay the weights in a manner that doesn't interact
-       # with the m/v parameters. This is equivalent to adding the square
-       # of the weights to the loss with plain (non-momentum) SGD.
-       if self.weight_decay_rate > 0:
-         if self._do_use_weight_decay(param_name):
-           update += self.weight_decay_rate * param
-
-       update_with_lr = learning_rate * update
-       next_param = param - update_with_lr
-
-       assignments.extend(
-           [param.assign(next_param),
-            m.assign(next_m),
-            v.assign(next_v)])
-
-     return assignments
-
-   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-     if isinstance(self.learning_rate, dict):
-       key_to_grads_and_vars = {}
-       for grad, var in grads_and_vars:
-         update_for_var = False
-         for key in self.learning_rate:
-           if key in var.name:
-             update_for_var = True
-             if key not in key_to_grads_and_vars:
-               key_to_grads_and_vars[key] = []
-             key_to_grads_and_vars[key].append((grad, var))
-         if not update_for_var:
-           raise ValueError("No learning rate specified for variable", var)
-       assignments = []
-       for key, key_grads_and_vars in key_to_grads_and_vars.items():
-         assignments += self._apply_gradients(key_grads_and_vars,
-                                              self.learning_rate[key])
-     else:
-       assignments = self._apply_gradients(grads_and_vars, self.learning_rate)
-     return tf.group(*assignments, name=name)
-
-   def _do_use_weight_decay(self, param_name):
-     """Whether to use L2 weight decay for `param_name`."""
-     if not self.weight_decay_rate:
-       return False
-     if self.exclude_from_weight_decay:
-       for r in self.exclude_from_weight_decay:
-         if re.search(r, param_name) is not None:
-           return False
-     return True
-
-   def _get_variable_name(self, param_name):
-     """Get the variable name from the tensor name."""
-     m = re.match("^(.*):\\d+$", param_name)
-     if m is not None:
-       param_name = m.group(1)
-     return param_name
-
-
- def _get_layer_lrs(learning_rate, layer_decay, n_layers):
-   """Have lower learning rates for layers closer to the input."""
-   key_to_depths = collections.OrderedDict({
-       "/embeddings/": 0,
-       "/embeddings_project/": 0,
-       "task_specific/": n_layers + 2,
-   })
-   for layer in range(n_layers):
-     key_to_depths["encoder/layer_" + str(layer) + "/"] = layer + 1
-   return {
-       key: learning_rate * (layer_decay ** (n_layers + 2 - depth))
-       for key, depth in key_to_depths.items()
-   }
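The long comment inside `_apply_gradients` above is the heart of this file: weight decay is added straight to the Adam update (`update += self.weight_decay_rate * param`) rather than folded into the loss, i.e. decoupled weight decay in the style of AdamW. A minimal NumPy sketch of that update rule, with illustrative names and values that are not part of SinaTools:

```python
import numpy as np

def adamw_step(param, grad, m, v, lr, wd, beta1=0.9, beta2=0.999, eps=1e-6):
    """One Adam step with decoupled weight decay, mirroring the deleted
    _apply_gradients: the decay term acts on the parameter directly and
    never passes through the m/v moment estimates."""
    m = beta1 * m + (1.0 - beta1) * grad       # first-moment estimate
    v = beta2 * v + (1.0 - beta2) * grad ** 2  # second-moment estimate
    update = m / (np.sqrt(v) + eps)            # standard Adam direction
    update += wd * param                       # decoupled weight decay
    return param - lr * update, m, v

# Even with a zero gradient the weight shrinks by lr * wd * param,
# which folding an L2 term into the loss would not reproduce under Adam.
p, m, v = np.array([1.0, -2.0]), np.zeros(2), np.zeros(2)
p, m, v = adamw_step(p, np.zeros(2), m, v, lr=1e-3, wd=0.01)
print(p)  # [ 0.99999 -1.99998]
```

The companion `_get_layer_lrs` applies the same multiplicative idea across depth: with `layer_decay = 0.8` and 12 transformer layers, the embeddings (depth 0) train at `0.8 ** 14 ≈ 0.044` of the base rate, while the `task_specific/` head trains at the full rate.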
sinatools/arabert/araelectra/model/tokenization.py
@@ -1,355 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Tokenization classes, the same as used for BERT."""
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import collections
- import unicodedata
- import six
- import tensorflow as tf
-
-
-
- def convert_to_unicode(text):
-   """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-   if six.PY3:
-     if isinstance(text, str):
-       return text
-     elif isinstance(text, bytes):
-       return text.decode("utf-8", "ignore")
-     else:
-       raise ValueError("Unsupported string type: %s" % (type(text)))
-   elif six.PY2:
-     if isinstance(text, str):
-       return text.decode("utf-8", "ignore")
-     elif isinstance(text, unicode):
-       return text
-     else:
-       raise ValueError("Unsupported string type: %s" % (type(text)))
-   else:
-     raise ValueError("Not running on Python2 or Python 3?")
-
-
- def printable_text(text):
-   """Returns text encoded in a way suitable for print or `tf.logging`."""
-
-   # These functions want `str` for both Python2 and Python3, but in one case
-   # it's a Unicode string and in the other it's a byte string.
-   if six.PY3:
-     if isinstance(text, str):
-       return text
-     elif isinstance(text, bytes):
-       return text.decode("utf-8", "ignore")
-     else:
-       raise ValueError("Unsupported string type: %s" % (type(text)))
-   elif six.PY2:
-     if isinstance(text, str):
-       return text
-     elif isinstance(text, unicode):
-       return text.encode("utf-8")
-     else:
-       raise ValueError("Unsupported string type: %s" % (type(text)))
-   else:
-     raise ValueError("Not running on Python2 or Python 3?")
-
-
- def load_vocab(vocab_file):
-   """Loads a vocabulary file into a dictionary."""
-   vocab = collections.OrderedDict()
-   index = 0
-   with tf.io.gfile.GFile(vocab_file, "r") as reader:
-     while True:
-       token = convert_to_unicode(reader.readline())
-       if not token:
-         break
-       token = token.strip()
-       vocab[token] = index
-       index += 1
-   return vocab
-
-
- def convert_by_vocab(vocab, items):
-   """Converts a sequence of [tokens|ids] using the vocab."""
-   output = []
-   for item in items:
-     output.append(vocab[item])
-   return output
-
-
- def convert_tokens_to_ids(vocab, tokens):
-   return convert_by_vocab(vocab, tokens)
-
-
- def convert_ids_to_tokens(inv_vocab, ids):
-   return convert_by_vocab(inv_vocab, ids)
-
-
- def whitespace_tokenize(text):
-   """Runs basic whitespace cleaning and splitting on a piece of text."""
-   text = text.strip()
-   if not text:
-     return []
-   tokens = text.split()
-   return tokens
-
-
- class FullTokenizer(object):
-   """Runs end-to-end tokenization."""
-
-   def __init__(self, vocab_file, do_lower_case=True):
-     self.vocab = load_vocab(vocab_file)
-     self.inv_vocab = {v: k for k, v in self.vocab.items()}
-     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-   def tokenize(self, text):
-     split_tokens = []
-     for token in self.basic_tokenizer.tokenize(text):
-       for sub_token in self.wordpiece_tokenizer.tokenize(token):
-         split_tokens.append(sub_token)
-
-     return split_tokens
-
-   def convert_tokens_to_ids(self, tokens):
-     return convert_by_vocab(self.vocab, tokens)
-
-   def convert_ids_to_tokens(self, ids):
-     return convert_by_vocab(self.inv_vocab, ids)
-
-
- class BasicTokenizer(object):
-   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
-   def __init__(self, do_lower_case=True):
-     """Constructs a BasicTokenizer.
-
-     Args:
-       do_lower_case: Whether to lower case the input.
-     """
-     self.do_lower_case = do_lower_case
-
-   def tokenize(self, text):
-     """Tokenizes a piece of text."""
-     text = convert_to_unicode(text)
-     text = self._clean_text(text)
-
-     # This was added on November 1st, 2018 for the multilingual and Chinese
-     # models. This is also applied to the English models now, but it doesn't
-     # matter since the English models were not trained on any Chinese data
-     # and generally don't have any Chinese data in them (there are Chinese
-     # characters in the vocabulary because Wikipedia does have some Chinese
-     # words in the English Wikipedia).
-     text = self._tokenize_chinese_chars(text)
-
-     orig_tokens = whitespace_tokenize(text)
-     split_tokens = []
-     for token in orig_tokens:
-       if self.do_lower_case:
-         token = token.lower()
-         token = self._run_strip_accents(token)
-       split_tokens.extend(self._run_split_on_punc(token))
-
-     output_tokens = whitespace_tokenize(" ".join(split_tokens))
-     return output_tokens
-
-   def _run_strip_accents(self, text):
-     """Strips accents from a piece of text."""
-     text = unicodedata.normalize("NFD", text)
-     output = []
-     for char in text:
-       cat = unicodedata.category(char)
-       if cat == "Mn":
-         continue
-       output.append(char)
-     return "".join(output)
-
-   def _run_split_on_punc(self, text):
-     """Splits punctuation on a piece of text."""
-     chars = list(text)
-     i = 0
-     start_new_word = True
-     output = []
-     while i < len(chars):
-       char = chars[i]
-       if _is_punctuation(char):
-         output.append([char])
-         start_new_word = True
-       else:
-         if start_new_word:
-           output.append([])
-         start_new_word = False
-         output[-1].append(char)
-       i += 1
-
-     return ["".join(x) for x in output]
-
-   def _tokenize_chinese_chars(self, text):
-     """Adds whitespace around any CJK character."""
-     output = []
-     for char in text:
-       cp = ord(char)
-       if self._is_chinese_char(cp):
-         output.append(" ")
-         output.append(char)
-         output.append(" ")
-       else:
-         output.append(char)
-     return "".join(output)
-
-   def _is_chinese_char(self, cp):
-     """Checks whether CP is the codepoint of a CJK character."""
-     # This defines a "chinese character" as anything in the CJK Unicode block:
-     #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-     #
-     # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-     # despite its name. The modern Korean Hangul alphabet is a different block,
-     # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-     # space-separated words, so they are not treated specially and are handled
-     # like all of the other languages.
-     if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-         (cp >= 0x3400 and cp <= 0x4DBF) or  #
-         (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-         (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-         (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-         (cp >= 0x2B820 and cp <= 0x2CEAF) or
-         (cp >= 0xF900 and cp <= 0xFAFF) or  #
-         (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
-       return True
-
-     return False
-
-   def _clean_text(self, text):
-     """Performs invalid character removal and whitespace cleanup on text."""
-     output = []
-     for char in text:
-       cp = ord(char)
-       if cp == 0 or cp == 0xfffd or _is_control(char):
-         continue
-       if _is_whitespace(char):
-         output.append(" ")
-       else:
-         output.append(char)
-     return "".join(output)
-
-
- class WordpieceTokenizer(object):
-   """Runs WordPiece tokenization."""
-
-   def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
-     self.vocab = vocab
-     self.unk_token = unk_token
-     self.max_input_chars_per_word = max_input_chars_per_word
-
-   def tokenize(self, text):
-     """Tokenizes a piece of text into its word pieces.
-
-     This uses a greedy longest-match-first algorithm to perform tokenization
-     using the given vocabulary.
-
-     For example:
-       input = "unaffable"
-       output = ["un", "##aff", "##able"]
-
-     Args:
-       text: A single token or whitespace separated tokens. This should have
-         already been passed through `BasicTokenizer`.
-
-     Returns:
-       A list of wordpiece tokens.
-     """
-
-     text = convert_to_unicode(text)
-
-     output_tokens = []
-     for token in whitespace_tokenize(text):
-       chars = list(token)
-       if len(chars) > self.max_input_chars_per_word:
-         output_tokens.append(self.unk_token)
-         continue
-
-       is_bad = False
-       start = 0
-       sub_tokens = []
-       while start < len(chars):
-         end = len(chars)
-         cur_substr = None
-         while start < end:
-           substr = "".join(chars[start:end])
-           if start > 0:
-             substr = "##" + substr
-           if substr in self.vocab:
-             cur_substr = substr
-             break
-           end -= 1
-         if cur_substr is None:
-           is_bad = True
-           break
-         sub_tokens.append(cur_substr)
-         start = end
-
-       if is_bad:
-         output_tokens.append(self.unk_token)
-       else:
-         output_tokens.extend(sub_tokens)
-     return output_tokens
-
-
- def _is_whitespace(char):
-   """Checks whether `char` is a whitespace character."""
-   # \t, \n, and \r are technically control characters but we treat them
-   # as whitespace since they are generally considered as such.
-   if char == " " or char == "\t" or char == "\n" or char == "\r":
-     return True
-   cat = unicodedata.category(char)
-   if cat == "Zs":
-     return True
-   return False
-
-
- def _is_control(char):
-   """Checks whether `char` is a control character."""
-   # These are technically control characters but we count them as whitespace
-   # characters.
-   if char == "\t" or char == "\n" or char == "\r":
-     return False
-   cat = unicodedata.category(char)
-   if cat.startswith("C"):
-     return True
-   return False
-
-
- def _is_punctuation(char):
-   """Checks whether `char` is a punctuation character."""
-   cp = ord(char)
-   # We treat all non-letter/number ASCII as punctuation.
-   # Characters such as "^", "$", and "`" are not in the Unicode
-   # Punctuation class but we treat them as punctuation anyways, for
-   # consistency.
-   if (
-       cp == 91 or cp == 93 or cp == 43
-   ):  # [ and ] are not punctuation since they are used in [xx] and the +
-     return False
-
-   if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-       (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-     return True
-   cat = unicodedata.category(char)
-   if cat.startswith("P"):
-     return True
-   return False
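The deleted `WordpieceTokenizer.tokenize` is the standard BERT greedy longest-match-first algorithm: at each position it takes the longest vocabulary entry (with a `##` prefix on non-initial pieces) and emits `[UNK]` for any token with an unmatchable remainder. A condensed, self-contained sketch of that loop (the toy vocabulary is illustrative):

```python
def wordpiece(token, vocab, unk="[UNK]"):
    """Greedy longest-match-first WordPiece, as in the deleted tokenizer."""
    pieces, start = [], 0
    while start < len(token):
        end = len(token)
        while start < end:  # shrink the candidate until it is in the vocab
            piece = ("##" if start > 0 else "") + token[start:end]
            if piece in vocab:
                pieces.append(piece)
                break
            end -= 1
        else:  # nothing starting here matched: the whole token becomes UNK
            return [unk]
        start = end
    return pieces

# The docstring's own example: "unaffable" -> ["un", "##aff", "##able"]
print(wordpiece("unaffable", {"un", "##aff", "##able"}))
```

`FullTokenizer` simply chains `BasicTokenizer` (cleaning, CJK spacing, optional lower-casing, punctuation splitting) with this WordPiece pass, so every emitted piece is either a vocabulary entry or `[UNK]`.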
@@ -1,14 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.