nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. nlpertools/__init__.py +24 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/data_client.py +387 -257
  9. nlpertools/data_structure/base_structure.py +109 -13
  10. nlpertools/dataprocess.py +611 -3
  11. nlpertools/default_db_config.yml +41 -0
  12. nlpertools/io/__init__.py +3 -3
  13. nlpertools/io/dir.py +54 -36
  14. nlpertools/io/file.py +277 -222
  15. nlpertools/ml.py +483 -460
  16. nlpertools/monitor/__init__.py +0 -0
  17. nlpertools/monitor/gpu.py +18 -0
  18. nlpertools/monitor/memory.py +24 -0
  19. nlpertools/movie.py +36 -0
  20. nlpertools/nlpertools_config.yml +1 -0
  21. nlpertools/{openApi.py → open_api.py} +65 -65
  22. nlpertools/other.py +364 -249
  23. nlpertools/pic.py +288 -0
  24. nlpertools/plugin.py +43 -43
  25. nlpertools/reminder.py +98 -87
  26. nlpertools/utils/__init__.py +3 -3
  27. nlpertools/utils/lazy.py +727 -0
  28. nlpertools/utils/log_util.py +20 -0
  29. nlpertools/utils/package.py +89 -76
  30. nlpertools/utils/package_v1.py +94 -0
  31. nlpertools/utils/package_v2.py +117 -0
  32. nlpertools/utils_for_nlpertools.py +93 -93
  33. nlpertools/vector_index_demo.py +108 -0
  34. nlpertools/wrapper.py +161 -96
  35. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  36. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  37. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  38. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  39. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  40. nlpertools_helper/__init__.py +10 -0
  41. nlpertools-1.0.5.dist-info/METADATA +0 -85
  42. nlpertools-1.0.5.dist-info/RECORD +0 -25
  43. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/dataprocess.py CHANGED
@@ -1,3 +1,611 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import re
5
+ import string
6
+ from typing import List
7
+
8
+ import numpy as np
9
+
10
+ # from . import DB_CONFIG_FILE # cannot import name 'DB_CONFIG_FILE' from partially initialized module 'nlpertools'
11
+ from .utils.package import *
12
+
13
+ main_special_characters = string.punctuation + string.digits + string.whitespace
14
+ other_special_characters = (
15
+ "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
16
+ "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
17
+ "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
18
+ "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
19
+ "」﴾》"
20
+ )
21
+
22
+
23
+ class Pattern:
24
+ """
25
+ >>> pattern_special_char = re.compile("[{}{}]".format(Pattern.special_char_x_pattern[1:-1], Pattern.special_char_u_pattern[1:-1]))
26
+ a = "\U000d8be6asdasdas \x00v啊实打实\x00\x00v阿松大\x00"
27
+ res = re.sub(pattern_special_char, "$",a)
28
+ """
29
+
30
+ # some from data-prepare
31
+
32
+ # emoji
33
+ """
34
+ # Another way to collect emoji; unclear whether this pattern is exhaustive
35
+ import emoji # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
36
+ emoji = list(emoji.UNICODE_EMOJI["en"].keys())
37
+ """
38
+ emoji_pattern = "[\U00010000-\U0010ffff\\uD800-\\uDBFF\\uDC00-\\uDFFF]"
39
+
40
+ # Special garbled or invisible characters
41
+ # \x 09:\t 0a:\n 0d:\r
42
+ special_char_x_pattern = "[\x00-\x08\x0b\x0c\x0e\x0f\x10-\x19\x1a-\x1f]"
43
+ # Abnormal characters collected from large-scale corpora
44
+ special_char_u_pattern = (
45
+ "[\u3000\U000d8be6\U000e0062\U000e0063\U000e0067\U000e0073\U000e0074\U000e007f]"
46
+ )
47
+ special_char_pattern = "{}{}".format(
48
+ special_char_x_pattern[1:-1], special_char_u_pattern[1:-1]
49
+ )
50
+ non_printing_characters_pattern = (
51
+ f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]"
52
+ )
53
+
54
+ # Must match from the beginning of the string, otherwise meaningless
55
+ # Chinese person names
56
+ chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
57
+ # English person names
58
+ english_name_pattern = "(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
59
+ # Pure digits
60
+ pure_num_pattern = "\d+"
61
+ # Phrases like "xxxx figure/table"
62
+ pic_table_descript_pattern = ".{1,15}图"
63
+
64
+ # These do not need to match from the beginning.
65
+ # hlink
66
+ hlink_pattern = (
67
+ r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
68
+ )
69
+ http_pattern = "(http|https):\/\/([\w.]+\/?)\S*/\S*"
70
+ # Email addresses
71
+ email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
72
+ # HTML; possibly too strict
73
+ html_pattern = "<[\s\S]*?>"
74
+ # Repetition, e.g. "asdasdasdasd"
75
+ repeat_pattern = "(.)\1+"
76
+ # Dates
77
+ day_time_pattern = "\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
78
+ # Time of day (hh:mm:ss)
79
+ hour_time_pattern = "(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
80
+ # Stock codes
81
+ stock_pattern = (
82
+ "(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
83
+ )
84
+
85
+ # These usually need to be replaced
86
+ # Redundant spaces => " "
87
+ redundancy_space_pattern = " +"
88
+ # Rarely needed: redundant line breaks => " "
89
+ linebreak_pattern = "[\r\n\t]+"
90
+
91
+ # Weibo video links and similar
92
+ weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
93
+ # @
94
+ at_pattern = "@\w+"
95
+
96
+ # from https://github.com/bigscience-workshop/data-preparation pii
97
+ year_patterns = [
98
+ r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])",
99
+ # yyyy-yyyy or yyyy/yyyy
100
+ r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
101
+ # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
102
+ r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
103
+ # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy
104
+ r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
105
+ # mm-yyyy or mm/yyyy or the same but with yy
106
+ r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
107
+ # yyyy-mm or yyyy/mm
108
+ ]
109
+
110
+ # Patterns for high-risk character strings
111
+ id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])'
112
+ # https://regex101.com/r/JQkmh8/2
113
+ # key_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[\s\p{Pd}]?){4,})(?:$|[\b\s\p{Han}@?,!;:\'\"])'
114
+ # https://regex101.com/r/JQkmh8/5
115
+ key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])'
116
+ ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
117
+ ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
118
+ ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
119
+ [ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
120
+
121
+ # https://regex101.com/r/EpA5B7/1
122
+ email_line_pattern = r'''
123
+ (?<= ^ | [\b\s@,?!;:)('".\p{Han}<] )
124
+ (
125
+ [^\b\s@?!;,:)('"<]+
126
+ @
127
+ [^\b\s@!?;,/]*
128
+ [^\b\s@?!;,/:)('">.]
129
+ \.
130
+ \p{L} \w{1,}
131
+ )
132
+ (?= $ | [\b\s@,?!;:)('".\p{Han}>] )
133
+ '''
134
+
135
+ # https://regex101.com/r/mOqi1s/3
136
+ # user_pattern = r'(?:^|[\s@,?!;:\'\")(\p{Han}])(@[^\s@,?!;:\'\")(]{3,})'
137
+ user_pattern = r'''
138
+ (?<= ^ | [)(\s@,?!;:'"\p{Han}] )
139
+ (@
140
+ [^)(\s@,?!;:'"]{3,}
141
+ )
142
+ '''
143
+
144
+
145
+ class CalcPPL(object):
146
+ # Perplexity (PPL) calculation
147
+ # https://www.scribendi.ai/comparing-bert-and-gpt-2-as-language-models-to-score-the-grammatical-correctness-of-a-sentence/
148
+ def __init__(self, model_type, model_path, tokenizer_path):
149
+ self.model_type = model_type
150
+ self.model, self.tokenizer = self._init_model(model_type, model_path, tokenizer_path)
151
+
152
+ @staticmethod
153
+ def _init_model(model_type, model_path, tokenizer_path):
154
+ if model_type == "ngram":
155
+ model = kenlm.Model(model_path)
156
+ tokenizer = sentencepiece.SentencePieceProcessor()
157
+ tokenizer.load(tokenizer_path)
158
+ elif model_type == "bert":
159
+ model = BertForMaskedLM.from_pretrained(model_path)
160
+ tokenizer = BertTokenizer.from_pretrained(model_path)
161
+ elif model_type == "gpt":
162
+ model = GPT2LMHeadModel.from_pretrained(model_path)
163
+ tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
164
+ else:
165
+ model = tokenizer = None
166
+ assert "model_type should in ngram bert gpt"
167
+ return model, tokenizer
168
+
169
+ def ppl(self, sentence):
170
+ # Dispatch automatically according to model_type
171
+ if self.model_type == "ngram":
172
+ return self.ppl_ngram(sentence)
173
+ elif self.model_type == "bert":
174
+ return self.ppl_bert(sentence)
175
+ else:
176
+ return self.ppl3_gpt(sentence)
177
+
178
+ def ppl_ngram(self, sentence):
179
+ pass
180
+
181
+ def ppl_bert_2(self, sentence):
182
+ # Forgot where this implementation came from
183
+ tokenizer = self.tokenizer
184
+ model = self.model
185
+ tokenize_input = tokenizer.tokenize(sentence)
186
+ tokenize_input = tokenize_input
187
+ tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
188
+ with torch.no_grad():
189
+ loss = model(tensor_input, labels=tensor_input)[0]
190
+ return np.exp(loss.detach().numpy())
191
+
192
+ # [1] Salazar J, Liang D, Nguyen T Q, et al. Masked Language Model Scoring[C]//Proceedings of ACL. 2020: 2699-2712.
193
+ def ppl_bert(self, sentence):
194
+ tokenizer = self.tokenizer
195
+ model = self.model
196
+ with torch.no_grad():
197
+ tokenize_input = tokenizer.tokenize(sentence)
198
+ tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
199
+ sen_len = len(tokenize_input)
200
+ sentence_loss = 0.
201
+
202
+ for i, word in enumerate(tokenize_input):
203
+ # add mask to i-th character of the sentence
204
+ tokenize_input[i] = '[MASK]'
205
+ mask_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
206
+
207
+ output = model(mask_input)
208
+
209
+ prediction_scores = output[0]
210
+ softmax = nn.Softmax(dim=0)
211
+ ps = softmax(prediction_scores[0, i]).log()
212
+ word_loss = ps[tensor_input[0, i]]
213
+ sentence_loss += word_loss.item()
214
+
215
+ tokenize_input[i] = word
216
+ ppl = np.exp(-sentence_loss / sen_len)
217
+ # print("困惑度:", ppl)
218
+ return ppl
219
+
220
+ def ppl3_gpt(self, text):
221
+ from torch.nn import CrossEntropyLoss
222
+ # GPT2LMHeadModel is used here
223
+ inputs = self.tokenizer([text], padding='max_length', max_length=50, truncation=True, return_tensors="pt")
224
+ bs, sl = inputs['input_ids'].size()
225
+ outputs = self.model(**inputs, labels=inputs['input_ids'])
226
+ logits = outputs[1]
227
+ # Shift so that tokens < n predict n
228
+ shift_logits = logits[:, :-1, :].contiguous()
229
+ shift_labels = inputs['input_ids'][:, 1:].contiguous()
230
+ shift_attentions = inputs['attention_mask'][:, 1:].contiguous()
231
+ # Flatten the tokens
232
+ loss_fct = CrossEntropyLoss(ignore_index=0, reduction="none")
233
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).detach().reshape(bs, -1)
234
+ meanloss = loss.sum(1) / shift_attentions.sum(1)
235
+ ppl = torch.exp(meanloss).numpy().tolist()
236
+ return ppl[0]
237
+
238
+ def test(self):
239
+ sentence = "An example input sentence."
240
+ ppl = self.ppl_bert_2(sentence)
241
+ ppl2 = self.ppl_bert(sentence)
242
+ print(ppl)
243
+ print(ppl2)
244
+
245
+
246
+ class TextProcess(object):
247
+ """
248
+ Text data processing class.
249
+ This is the base class; inherit from it for customized language processing.
250
+ """
251
+
252
+ def __init__(
253
+ self,
254
+ patterns_filter: List = None,
255
+ patterns_replace: List[List] = None,
256
+ words_filter: List = []
257
+ ):
258
+ """
259
+ pattern_list:
260
+ """
261
+ self.patterns_filter, self.patterns_replace = self._pre_compile_pattern(
262
+ patterns_filter, patterns_replace
263
+ )
264
+ self.words_filter = words_filter
265
+
266
+ @staticmethod
267
+ def _pre_compile_pattern(patterns_filter, patterns_replace):
268
+ complied_patterns_replace, complied_patterns_filter = [], []
269
+ for i in patterns_filter:
270
+ complied_patterns_filter.append(re.compile(i))
271
+ for i in patterns_replace:
272
+ complied_patterns_replace.append((re.compile(i[0]), i[1]))
273
+ return complied_patterns_filter, complied_patterns_replace
274
+
275
+ def process(self, text):
276
+ # Normalization that every incoming text goes through
277
+ text = self.full2half(text)
278
+ # text = self.filter_http(text)
279
+ text = self.filter_html(text)
280
+ text = self.filter_html_special(text)
281
+ # Handle according to text type and language
282
+ text = self.filter_exclusive(text)
283
+ # text = self.trandition2simple(text)
284
+ # text = self.remove_stopwords(text)
285
+ return text
286
+
287
+ def filter_words(self, text):
288
+ # Check against the word list; returns True on a hit, meaning the text should be filtered out
289
+
290
+ for word in self.words_filter:
291
+ if word in text:
292
+ return True
293
+ return False
294
+
295
+ def filter_whitelist(self, text):
296
+ whitelist = re.compile(
297
+ "[^\u4e00-\u9fa5^0-9a-zA-Z^-^《^》^<^>^【^】^(^)^{^}^–^…^”^“^,^.^;^?^:^‘^~^`^,^。^?^;^!^:^、^·^!^@^#^$^%^&^(^)^|]"
298
+ )
299
+ text = whitelist.sub("", text)
300
+ return text
301
+
302
+ def text_split(self, text, language):
303
+ if language == "en":
304
+ text = text[:256]
305
+ elif language == "zh":
306
+ text = text[:510]
307
+ return text
308
+
309
+ def trandition2simple(self, text):
310
+ # Chinese only
311
+ """
312
+ https://juejin.cn/post/7234554420163100728
313
+ """
314
+ text = zhconv.convert(text, "zh-cn")
315
+ return text
316
+
317
+ def remove_stopwords(self, text):
318
+ import jieba
319
+
320
+ new_tokens = []
321
+ if self.language == "en":
322
+ tokens = text.split(" ")
323
+ else:
324
+ tokens = jieba.lcut(text)
325
+
326
+ for i in tokens:
327
+ if i in self.stopwords:
328
+ pass
329
+ else:
330
+ new_tokens.append(i)
331
+
332
+ return new_tokens
333
+
334
+ @staticmethod
335
+ def split_sentence(sentence, language="chinese"):
336
+ """
337
+ Sentence splitting. English has NLTK; Chinese deserves a good sentence splitter as well.
338
+ :param sentence:
339
+ :param language:
340
+ :return:
341
+ """
342
+ # sentences->Str
343
+ # example '12“345。”“6789”'
344
+ assert language in ["chinese", "english"], "other languages are not supported"
345
+ if language == "chinese":
346
+ split_signs = list("。!?…\t")
347
+ other_sign = "”"
348
+ elif language == "english":
349
+ split_signs = list(".!?")
350
+ other_sign = '"'
351
+ else:
352
+ split_signs = list(".!?")
353
+ other_sign = '"'
354
+ sentences = []
355
+ start_idx = 0
356
+ for idx, char in enumerate(sentence):
357
+ if idx == len(sentence) - 1:
358
+ if char in split_signs:
359
+ sentences.append(sentence[start_idx: idx + 1].strip())
360
+ start_idx = idx + 1
361
+ else:
362
+ sentences.append(sentence[start_idx:].strip())
363
+ else:
364
+ if char in split_signs:
365
+ if sentence[idx + 1] == other_sign:
366
+ if idx < len(sentence) - 2:
367
+ # Handle the case of 。”。 (closing quote after sentence-final punctuation)
368
+ if sentence[idx + 2] not in split_signs:
369
+ sentences.append(sentence[start_idx: idx + 2].strip())
370
+ start_idx = idx + 2
371
+ elif sentence[idx + 1] not in split_signs:
372
+ sentences.append(sentence[start_idx: idx + 1].strip())
373
+ start_idx = idx + 1
374
+ sentences = [i.strip() for i in sentences if i.strip()]
375
+ return sentences
376
+
377
+ def cut_word(self, text, language):
378
+ import jieba
379
+
380
+ if language == "en":
381
+ tokens = text.split(" ")
382
+ else:
383
+ tokens = jieba.lcut(text)
384
+ return tokens
385
+
386
+ def full2half(self, text):
387
+ """
388
+ Convert full-width characters to half-width.
389
+ :param text:
390
+ :return:
391
+ """
392
+ ret_str = ""
393
+ for i in text:
394
+ if ord(i) >= 33 + 65248 and ord(i) <= 126 + 65248:
395
+ ret_str += chr(ord(i) - 65248)
396
+ else:
397
+ ret_str += i
398
+ return ret_str
399
+
400
+ def filter_html(self, text):
401
+ # This one is fairly strict
402
+ """
403
+ Strip HTML tags.
404
+ :param text:
405
+ :return:
406
+ """
407
+ patterns = [
408
+ re.compile("//<![CDATA[[^>]*//]]>", re.I), # 匹配CDATA
409
+ re.compile("<s*script[^>]*>[^<]*<s*/s*scripts*>", re.I), # Script
410
+ re.compile("<s*style[^>]*>[^<]*<s*/s*styles*>", re.I), # style
411
+ re.compile("<brs*?/?>"), # 处理换行
412
+ re.compile("</?w+[^>]*>"), # HTML标签
413
+ re.compile("<!--[^>]*-->"), # HTML注释
414
+ ]
415
+ for pattern in patterns:
416
+ text = pattern.sub("", text)
417
+ return text
418
+
419
+ def filter_html_special(self, text):
420
+ """
421
+ Replace all HTML escaped entities.
422
+ This seems to appear only in news text?
423
+ :param text:
424
+ :return:
425
+ """
426
+ # TODO: HTML entities look like &nbsp; does \xa0 count as one too?
427
+ CHAR_ENTITIES = {
428
+ "&nbsp": " ",
429
+ "160": " ",
430
+ "lt": "<",
431
+ "60": "<",
432
+ "gt": ">",
433
+ "62": ">",
434
+ "amp": "&",
435
+ "38": "&",
436
+ "quot": '"',
437
+ "34": '"',
438
+ "ldquo": '"',
439
+ "rdquo": '"',
440
+ "mdash": "",
441
+ "\xa0": "",
442
+ }
443
+
444
+ re_charEntity = re.compile(r"&#?(?P<name>\w+);", re.S)
445
+ sz = re.search(re_charEntity, text)
446
+ while sz:
447
+ entity = sz.group() # full entity, e.g. &gt;
448
+ key = sz.group("name") # entity name with & and ; stripped, e.g. gt for >
449
+ try:
450
+ htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], text, 1)
451
+ text = htmlstr
452
+ sz = re.search(re_charEntity, htmlstr)
453
+ except KeyError:
454
+ # replace with an empty string
455
+ htmlstr = re_charEntity.sub("", text, 1)
456
+ text = htmlstr
457
+ sz = re_charEntity.search(htmlstr)
458
+ return text
459
+
460
+ def filter_exclusive(self, text):
461
+ """
462
+ Remove @, #, emoticons and other artifacts "specific" to Twitter/Weibo
463
+ :return:
464
+ """
465
+ pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
466
+ p = re.compile(pattern, re.S)
467
+ text = p.sub("", text)
468
+
469
+ dr = re.compile("@\w+", re.S)
470
+ text = dr.sub("", text)
471
+
472
+ return text
473
+
474
+ def filter_html_tag(self, text):
475
+ # res_tr = r'<a (.*?)></a>'
476
+ # m_tr = re.findall(res_tr,text,re.S|re.M)
477
+ res = re.sub(r"<a.*?>", "", text)
478
+ res = re.sub(r"</a>", "", res)
479
+ res = re.sub(r"<span.*?>", "", res)
480
+ res = re.sub(r"</span>", "", res)
481
+ res = re.sub(r"<img.*?>", "", res)
482
+ res = re.sub(r"<br.*?>", "", res)
483
+ res = re.sub(r"//", "", res)
484
+ res = re.sub(r"@", "", res)
485
+ res = re.sub(r"</", "", res)
486
+ # res = re.sub(r',', '', res)
487
+ # res = re.sub(r'&nbsp;', '', res)
488
+ return res
489
+
490
+ @staticmethod
491
+ def uniform_whitespace(
492
+ document,
493
+ whitespace=[
494
+ " ",
495
+ " ",
496
+ " ",
497
+ " ",
498
+ " ",
499
+ " ",
500
+ " ",
501
+ " ",
502
+ " ",
503
+ " ",
504
+ "",
505
+ "„",
506
+ ],
507
+ ):
508
+ # from https://github.com/bigscience-workshop/data-preparation
509
+ """There are different whitespace characters."""
510
+ whitespace = set(whitespace)
511
+ document = "".join(
512
+ [char if char not in whitespace else " " for char in document]
513
+ )
514
+ return document
515
+
516
+ def filter_pattern(self, text):
517
+ """
518
+ Returning True means a rule was hit and the text should be filtered out
519
+ """
520
+ for pattern in self.patterns_filter:
521
+ if re.match(pattern, text):
522
+ return True
523
+ return False
524
+
525
+ def replace_pattern(self, text):
526
+ for pattern, replace in self.patterns_replace:
527
+ text = re.sub(pattern, replace, text)
528
+ return text
529
+
530
+ def calc_proportion_zh(self, text):
531
+ text = text.strip()
532
+ # Handle mixed Chinese/English text where the English words are separated by spaces
533
+ if " " in text:
534
+ pass
535
+ chinese_count = 0
536
+ for char in text:
537
+ if '\u4e00' <= char <= '\u9fff':
538
+ chinese_count += 1
539
+ else:
540
+ pass
541
+ class CopyFunc():
542
+ # from https://github.com/lemon234071/clean-dialog
543
+ def is_chinese_char(cp):
544
+ """Checks whether CP is the codepoint of a CJK character."""
545
+ # This defines a "chinese character" as anything in the CJK Unicode block:
546
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
547
+ #
548
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
549
+ # despite its name. The modern Korean Hangul alphabet is a different block,
550
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
551
+ # space-separated words, so they are not treated specially and handled
552
+ # like the all of the other languages.
553
+ return (
554
+ (cp >= 0x4E00 and cp <= 0x9FFF)
555
+ or (cp >= 0x3400 and cp <= 0x4DBF) #
556
+ or (cp >= 0x20000 and cp <= 0x2A6DF) #
557
+ or (cp >= 0x2A700 and cp <= 0x2B73F) #
558
+ or (cp >= 0x2B740 and cp <= 0x2B81F) #
559
+ or (cp >= 0x2B820 and cp <= 0x2CEAF) #
560
+ or (cp >= 0xF900 and cp <= 0xFAFF)
561
+ or (cp >= 0x2F800 and cp <= 0x2FA1F) #
562
+ )
563
+
564
+ def contains_Chinese(seq):
565
+ for char in seq:
566
+ cp = ord(char)
567
+ if is_chinese_char(cp):
568
+ return True
569
+ return False
570
+
571
+
572
+ class EnTextProcess(object):
573
+ pass
574
+
575
+
576
+ def convert2markdown(table: list) -> str:
577
+ df = pd.DataFrame(table[1:], columns=table[0])
578
+
579
+ return df.to_markdown(index=False)
580
+
581
+
582
+ def convert_fullwidth2_basic(sentence):
583
+ # Reference: https://fuhaoku.net/U+FF21
584
+ new_sentence = ""
585
+ for char in sentence:
586
+ if 65281 <= ord(char) <= 65374:
587
+ char = chr(ord(char) - 65248)
588
+ new_sentence += char
589
+ return new_sentence
590
+
591
+
592
+ def convert_basic2fullwidth(sentence):
593
+ new_sentence = ""
594
+ for char in sentence:
595
+ if 33 <= ord(char) <= 126:
596
+ char = chr(ord(char) + 65248)
597
+ new_sentence += char
598
+ return new_sentence
599
+
600
+ if __name__ == "__main__":
601
+ pattern_for_filter = [
602
+ Pattern.redundancy_space_pattern,
603
+ Pattern.repeat_pattern,
604
+ Pattern.special_char_pattern,
605
+ ]
606
+ pattern_for_replace = [(Pattern.special_char_pattern, " ")]
607
+
608
+ dp = TextProcess(
609
+ patterns_filter=pattern_for_filter, patterns_replace=pattern_for_replace
610
+ )
611
+ dp.process(text="demo")
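The __main__ block above doubles as a smoke test for the new API. Below is a minimal usage sketch of the added Pattern/TextProcess classes; it is not part of the package diff, and the import path simply assumes dataprocess is importable from the installed wheel:

from nlpertools.dataprocess import Pattern, TextProcess

# Filter rules: a text matching any of these (from the start, via re.match) is considered junk.
filters = [Pattern.pure_num_pattern, Pattern.hlink_pattern]
# Replace rules: (pattern, replacement) pairs applied with re.sub.
replacements = [(Pattern.redundancy_space_pattern, " "), (Pattern.at_pattern, "")]

tp = TextProcess(patterns_filter=filters, patterns_replace=replacements)

text = "@user check https://example.com   now"
if not tp.filter_pattern(text):  # True would mean: drop this text entirely
    cleaned = tp.replace_pattern(tp.process(text))  # normalize, then apply replacements
    print(cleaned)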
nlpertools/default_db_config.yml ADDED
@@ -0,0 +1,41 @@
1
+ neo4j_url: "******"
2
+
3
+ mysql:
4
+ host: "******"
5
+ port: "******"
6
+ user: "******"
7
+ password: "******"
8
+ database: "******"
9
+
10
+ es:
11
+ host:
12
+ - "******"
13
+ - "******"
14
+ - "******"
15
+ - "******"
16
+ timeout:
17
+ - "******"
18
+
19
+ mongo:
20
+ - uri: "******"
21
+ - db: "******"
22
+ - col: "******"
23
+ redis:
24
+ - uri: "******"
25
+
26
+ hbase:
27
+ - # 配置
28
+ - topic_num: "******"
29
+ - # 默认配置
30
+ - default_host: "******"
31
+ - default_port: "******"
32
+ - default_transport: "******"
33
+ - default_compat: "******"
34
+ - default_protocol: "******"
35
+
36
+ kafka:
37
+ bootstrap_server:
38
+ - "******"
39
+ - "******"
40
+ - "******"
41
+ topic: "******"
nlpertools/io/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji