nlpertools 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. nlpertools/__init__.py +24 -11
  2. nlpertools/algo/__init__.py +0 -0
  3. nlpertools/algo/ac.py +18 -0
  4. nlpertools/algo/bit_ops.py +28 -0
  5. nlpertools/algo/kmp.py +94 -0
  6. nlpertools/algo/num_ops.py +12 -0
  7. nlpertools/algo/template.py +116 -0
  8. nlpertools/algo/union.py +13 -0
  9. nlpertools/data_client.py +387 -0
  10. nlpertools/data_structure/__init__.py +0 -0
  11. nlpertools/data_structure/base_structure.py +109 -0
  12. nlpertools/dataprocess.py +611 -3
  13. nlpertools/default_db_config.yml +41 -0
  14. nlpertools/io/__init__.py +3 -3
  15. nlpertools/io/dir.py +54 -47
  16. nlpertools/io/file.py +277 -205
  17. nlpertools/ml.py +483 -317
  18. nlpertools/monitor/__init__.py +0 -0
  19. nlpertools/monitor/gpu.py +18 -0
  20. nlpertools/monitor/memory.py +24 -0
  21. nlpertools/movie.py +36 -0
  22. nlpertools/nlpertools_config.yml +1 -0
  23. nlpertools/{openApi.py → open_api.py} +65 -62
  24. nlpertools/other.py +364 -188
  25. nlpertools/pic.py +288 -0
  26. nlpertools/plugin.py +43 -34
  27. nlpertools/reminder.py +98 -15
  28. nlpertools/template/__init__.py +0 -0
  29. nlpertools/utils/__init__.py +3 -0
  30. nlpertools/utils/lazy.py +727 -0
  31. nlpertools/utils/log_util.py +20 -0
  32. nlpertools/utils/package.py +89 -0
  33. nlpertools/utils/package_v1.py +94 -0
  34. nlpertools/utils/package_v2.py +117 -0
  35. nlpertools/utils_for_nlpertools.py +93 -0
  36. nlpertools/vector_index_demo.py +108 -0
  37. nlpertools/wrapper.py +161 -0
  38. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  39. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  40. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  41. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  42. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  43. nlpertools_helper/__init__.py +10 -0
  44. nlpertools-1.0.4.dist-info/METADATA +0 -42
  45. nlpertools-1.0.4.dist-info/RECORD +0 -15
  46. nlpertools-1.0.4.dist-info/top_level.txt +0 -1
nlpertools/dataprocess.py CHANGED
@@ -1,3 +1,611 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import re
5
+ import string
6
+ from typing import List
7
+
8
+ import numpy as np
9
+
10
+ # from . import DB_CONFIG_FILE # cannot import name 'DB_CONFIG_FILE' from partially initialized module 'nlpertools'
11
+ from .utils.package import *
12
+
13
+ main_special_characters = string.punctuation + string.digits + string.whitespace
14
+ other_special_characters = (
15
+ "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
16
+ "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
17
+ "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
18
+ "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
19
+ "」﴾》"
20
+ )
21
+
22
+
23
class Pattern:
    r"""Namespace of regular-expression source strings used for text cleaning.

    Every attribute is a plain ``str`` (or a list of ``str``); nothing is
    compiled here.  Example::

        pattern_special_char = re.compile(Pattern.special_char_pattern)
        res = re.sub(pattern_special_char, "$", "abc\x00def")

    NOTE: the patterns taken from bigscience-workshop/data-preparation
    (``year_patterns``, ``id_pattern``, ``key_pattern``, ``ip_pattern``,
    ``email_line_pattern``, ``user_pattern``) use ``\p{...}`` Unicode
    properties.  The standard ``re`` module rejects that escape, so they
    have to be compiled with the third-party ``regex`` module.
    """

    # some from data-prepare

    # emoji
    # Alternative way to obtain emoji (unclear whether this pattern is complete):
    #   import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
    #   emoji = list(emoji.UNICODE_EMOJI["en"].keys())
    emoji_pattern = "[\U00010000-\U0010ffff\\uD800-\\uDBFF\\uDC00-\\uDFFF]"

    # Garbled / invisible control characters.
    # \x09 (\t), \x0a (\n) and \x0d (\r) are deliberately left out.
    special_char_x_pattern = "[\x00-\x08\x0b\x0c\x0e\x0f\x10-\x19\x1a-\x1f]"
    # Abnormal characters found by counting over a large corpus.
    special_char_u_pattern = (
        "[\u3000\U000d8be6\U000e0062\U000e0063\U000e0067\U000e0073\U000e0074\U000e007f]"
    )
    # Union of the two classes above.  The surrounding brackets are required:
    # without them the value is a literal character sequence, not a class.
    special_char_pattern = "[{}{}]".format(
        special_char_x_pattern[1:-1], special_char_u_pattern[1:-1]
    )
    non_printing_characters_pattern = (
        f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]"
    )

    # --- Patterns that only make sense when matched from the start ---
    # Chinese personal name
    chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
    # English personal name
    english_name_pattern = r"(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
    # Digits only
    pure_num_pattern = r"\d+"
    # Captions such as "xxxx figure"
    pic_table_descript_pattern = ".{1,15}图"

    # --- Patterns that do not need to be anchored ---
    # hyperlink
    hlink_pattern = (
        r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    )
    http_pattern = r"(http|https):\/\/([\w.]+\/?)\S*/\S*"
    # e-mail address
    email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+"
    # HTML tag (possibly too strict)
    html_pattern = r"<[\s\S]*?>"
    # A character repeated two or more times in a row.  Must be a raw string,
    # otherwise "\1" is the control character \x01 rather than a backreference.
    repeat_pattern = r"(.)\1+"
    # Date such as 2023-05-17; "\1" re-uses the separator captured by "(-)".
    day_time_pattern = r"\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
    # Time of day HH:MM:SS
    hour_time_pattern = r"(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
    # Stock code
    stock_pattern = (
        r"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
    )

    # --- Patterns that are normally used for replacement ---
    # Redundant spaces => " "
    redundancy_space_pattern = " +"
    # Rarely needed: redundant line breaks / tabs => " "
    linebreak_pattern = "[\r\n\t]+"

    # Weibo boilerplate ("xxx's Weibo video", hashtags, brackets, "repost")
    weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
    # @mention
    at_pattern = r"@\w+"

    # from https://github.com/bigscience-workshop/data-preparation pii
    year_patterns = [
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-yyyy or yyyy/yyyy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # mm-yyyy or mm/yyyy or the same but with yy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-mm or yyyy/mm
    ]

    # Patterns for high-risk character strings
    id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])'
    # https://regex101.com/r/JQkmh8/2
    # key_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[\s\p{Pd}]?){4,})(?:$|[\b\s\p{Han}@?,!;:\'\"])'
    # https://regex101.com/r/JQkmh8/5
    key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])'
    ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
    ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
    ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
        [ipv4_pattern, ipv6_pattern]) + ")(?:$|[\\s@,?!;:'\"(.\\p{Han}])"

    # https://regex101.com/r/EpA5B7/1
    # NOTE(review): laid out for verbose mode and uses a variable-width
    # look-behind; presumably meant for regex.compile(..., regex.VERBOSE) -- confirm.
    email_line_pattern = r'''
        (?<= ^ | [\b\s@,?!;:)('".\p{Han}<] )
        (
          [^\b\s@?!;,:)('"<]+
          @
          [^\b\s@!?;,/]*
          [^\b\s@?!;,/:)('">.]
          \.
          \p{L} \w{1,}
        )
        (?= $ | [\b\s@,?!;:)('".\p{Han}>] )
    '''

    # https://regex101.com/r/mOqi1s/3
    # user_pattern = r'(?:^|[\s@,?!;:\'\")(\p{Han}])(@[^\s@,?!;:\'\")(]{3,})'
    user_pattern = r'''
        (?<= ^ | [)(\s@,?!;:'"\p{Han}] )
        (@
          [^)(\s@,?!;:'"]{3,}
        )
    '''
143
+
144
+
145
class CalcPPL(object):
    """Perplexity (PPL) scorer backed by an n-gram, BERT or GPT-2 language model.

    Background reading:
    https://www.scribendi.ai/comparing-bert-and-gpt-2-as-language-models-to-score-the-grammatical-correctness-of-a-sentence/

    NOTE(review): ``kenlm``, ``sentencepiece``, ``torch`` and the transformers
    classes are not imported here explicitly; presumably they come from
    ``from .utils.package import *`` -- confirm.
    """

    def __init__(self, model_type, model_path, tokenizer_path):
        """Load the model and tokenizer for *model_type*.

        :param model_type: one of ``"ngram"``, ``"bert"`` or ``"gpt"``.
        :param model_path: path/identifier handed to the model loader.
        :param tokenizer_path: sentencepiece model path (only read for ``"ngram"``).
        :raises ValueError: if *model_type* is not supported.
        """
        self.model_type = model_type
        self.model, self.tokenizer = self._init_model(model_type, model_path, tokenizer_path)

    @staticmethod
    def _init_model(model_type, model_path, tokenizer_path):
        """Return ``(model, tokenizer)`` for *model_type*.

        :raises ValueError: if *model_type* is not ``ngram``, ``bert`` or ``gpt``.
        """
        if model_type == "ngram":
            model = kenlm.Model(model_path)
            tokenizer = sentencepiece.SentencePieceProcessor()
            tokenizer.load(tokenizer_path)
        elif model_type == "bert":
            model = BertForMaskedLM.from_pretrained(model_path)
            tokenizer = BertTokenizer.from_pretrained(model_path)
        elif model_type == "gpt":
            model = GPT2LMHeadModel.from_pretrained(model_path)
            tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
        else:
            # The previous `assert "<message>"` asserted a non-empty string and
            # therefore never failed; reject unknown types explicitly.
            raise ValueError(
                "model_type should be one of ngram, bert, gpt; got {!r}".format(model_type)
            )
        return model, tokenizer

    def ppl(self, sentence):
        """Score *sentence* with the scorer that matches ``self.model_type``."""
        if self.model_type == "ngram":
            return self.ppl_ngram(sentence)
        elif self.model_type == "bert":
            return self.ppl_bert(sentence)
        else:
            return self.ppl3_gpt(sentence)

    def ppl_ngram(self, sentence):
        """N-gram perplexity -- not implemented yet, returns ``None``."""
        pass

    def ppl_bert_2(self, sentence):
        """Perplexity as ``exp(loss)`` of one forward pass with labels = inputs.

        (Origin of this recipe is unknown.)
        """
        tokenizer = self.tokenizer
        model = self.model
        tokenize_input = tokenizer.tokenize(sentence)
        tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        with torch.no_grad():
            loss = model(tensor_input, labels=tensor_input)[0]
        return np.exp(loss.detach().numpy())

    # [1] Salazar J, Liang D, Nguyen T Q, et al. Masked Language Model Scoring[C]//Proceedings of ACL. 2020: 2699-2712.
    def ppl_bert(self, sentence):
        """Pseudo-perplexity: mask each token in turn and average its log-probability.

        :return: ``exp(-sum(log p(token_i | rest)) / number_of_tokens)``.
        """
        tokenizer = self.tokenizer
        model = self.model
        with torch.no_grad():
            tokenize_input = tokenizer.tokenize(sentence)
            tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
            sen_len = len(tokenize_input)
            sentence_loss = 0.

            for i, word in enumerate(tokenize_input):
                # Mask the i-th token, score it, then restore it.
                tokenize_input[i] = '[MASK]'
                mask_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])

                output = model(mask_input)

                prediction_scores = output[0]
                # Log-probabilities over the vocabulary at the masked position.
                ps = torch.log_softmax(prediction_scores[0, i], dim=0)
                word_loss = ps[tensor_input[0, i]]
                sentence_loss += word_loss.item()

                tokenize_input[i] = word
            ppl = np.exp(-sentence_loss / sen_len)
        return ppl

    def ppl3_gpt(self, text):
        """Perplexity of *text* under a causal LM (intended for ``GPT2LMHeadModel``).

        The text is padded/truncated to 50 tokens; per-token losses are summed
        and divided by the number of attended positions.
        """
        from torch.nn import CrossEntropyLoss
        inputs = self.tokenizer([text], padding='max_length', max_length=50, truncation=True, return_tensors="pt")
        bs, sl = inputs['input_ids'].size()
        outputs = self.model(**inputs, labels=inputs['input_ids'])
        logits = outputs[1]
        # Shift so that tokens < n predict n
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = inputs['input_ids'][:, 1:].contiguous()
        shift_attentions = inputs['attention_mask'][:, 1:].contiguous()
        # Flatten the tokens.
        # NOTE(review): ignore_index=0 assumes the padding token id is 0 -- confirm.
        loss_fct = CrossEntropyLoss(ignore_index=0, reduction="none")
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).detach().reshape(bs, -1)
        meanloss = loss.sum(1) / shift_attentions.sum(1)
        ppl = torch.exp(meanloss).numpy().tolist()
        return ppl[0]

    def test(self):
        """Print the two BERT-based scores for a fixed sample sentence."""
        sentence = "输入句子:"
        ppl = self.ppl_bert_2(sentence)
        ppl2 = self.ppl_bert(sentence)
        print(ppl)
        print(ppl2)
244
+
245
+
246
+ class TextProcess(object):
247
+ """
248
+ 数据处理类
249
+ 这是基类,如果是定制化的语言处理,请继承该类
250
+ """
251
+
252
+ def __init__(
253
+ self,
254
+ patterns_filter: List = None,
255
+ patterns_replace: List[List] = None,
256
+ words_filter: List = []
257
+ ):
258
+ """
259
+ pattern_list:
260
+ """
261
+ self.patterns_filter, self.patterns_replace = self._pre_compile_pattern(
262
+ patterns_filter, patterns_replace
263
+ )
264
+ self.words_filter = words_filter
265
+
266
+ @staticmethod
267
+ def _pre_compile_pattern(patterns_filter, patterns_replace):
268
+ complied_patterns_replace, complied_patterns_filter = [], []
269
+ for i in patterns_filter:
270
+ complied_patterns_filter.append(re.compile(i))
271
+ for i in patterns_replace:
272
+ complied_patterns_replace.append((re.compile(i[0]), i[1]))
273
+ return complied_patterns_filter, complied_patterns_replace
274
+
275
+ def process(self, text):
276
+ # 进来的数据都要做的标准化
277
+ text = self.full2half(text)
278
+ # text = self.filter_http(text)
279
+ text = self.filter_html(text)
280
+ text = self.filter_html_special(text)
281
+ # 根据类型与语言分别处理
282
+ text = self.filter_exclusive(text)
283
+ # text = self.trandition2simple(text)
284
+ # text = self.remove_stopwords(text)
285
+ return text
286
+
287
+ def filter_words(self, text):
288
+ # 根据词典,命中返回True,需要过滤掉
289
+
290
+ for word in self.words_filter:
291
+ if word in text:
292
+ return True
293
+ return False
294
+
295
+ def filter_whitelist(self, text):
296
+ whitelist = re.compile(
297
+ "[^\u4e00-\u9fa5^0-9a-zA-Z^-^《^》^<^>^【^】^(^)^{^}^–^…^”^“^,^.^;^?^:^‘^~^`^,^。^?^;^!^:^、^·^!^@^#^$^%^&^(^)^|]"
298
+ )
299
+ text = whitelist.sub("", text)
300
+ return text
301
+
302
+ def text_split(self, text, language):
303
+ if language == "en":
304
+ text = text[:256]
305
+ elif language == "zh":
306
+ text = text[:510]
307
+ return text
308
+
309
+ def trandition2simple(self, text):
310
+ # 仅对中文
311
+ """
312
+ https://juejin.cn/post/7234554420163100728
313
+ """
314
+ text = zhconv.convert("我幹什麼不干你事。", "zh-cn")
315
+ return text
316
+
317
+ def remove_stopwords(self, text):
318
+ import jieba
319
+
320
+ new_tokens = []
321
+ if self.language == "en":
322
+ tokens = text.split(" ")
323
+ else:
324
+ tokens = jieba.lcut(text)
325
+
326
+ for i in tokens:
327
+ if i in self.stopwords:
328
+ pass
329
+ else:
330
+ new_tokens.append(i)
331
+
332
+ return new_tokens
333
+
334
    @staticmethod
    def split_sentence(sentence, language="chinese"):
        """
        Split a text into sentences on sentence-final punctuation.

        A closing quotation mark that directly follows a split sign stays with
        the sentence it closes, and a run of consecutive split signs is not cut
        in the middle.

        :param sentence: the text to split (a single string).
        :param language: "chinese" or "english"; selects the split signs and
            the closing-quote character. Any other value fails the assertion.
        :return: list of stripped, non-empty sentence strings.
        """
        # sentences->Str
        # example '12“345。”“6789”'
        assert language in ["chinese", "english"], "unsupportable for other language"
        if language == "chinese":
            split_signs = list("。!?…\t")
            other_sign = "”"
        elif language == "english":
            split_signs = list(".!?")
            other_sign = '"'
        else:
            # Not reachable: the assertion above only admits the two languages.
            split_signs = list(".!?")
            other_sign = '"'
        sentences = []
        start_idx = 0  # index where the sentence currently being collected starts
        for idx, char in enumerate(sentence):
            if idx == len(sentence) - 1:
                # Last character: flush whatever is still pending.
                if char in split_signs:
                    sentences.append(sentence[start_idx: idx + 1].strip())
                    start_idx = idx + 1
                else:
                    sentences.append(sentence[start_idx:].strip())
            else:
                if char in split_signs:
                    if sentence[idx + 1] == other_sign:
                        if idx < len(sentence) - 2:
                            # Handle a split sign, closing quote and another split
                            # sign in a row: only cut after the quote when the
                            # character behind it is not a split sign.
                            if sentence[idx + 2] not in split_signs:
                                sentences.append(sentence[start_idx: idx + 2].strip())
                                start_idx = idx + 2
                    elif sentence[idx + 1] not in split_signs:
                        # Cut only at the last sign of a run of split signs.
                        sentences.append(sentence[start_idx: idx + 1].strip())
                        start_idx = idx + 1
        # Drop pieces that are empty after stripping.
        sentences = [i.strip() for i in sentences if i.strip()]
        return sentences
376
+
377
+ def cut_word(self, text, language):
378
+ import jieba
379
+
380
+ if language == "en":
381
+ tokens = text.split(" ")
382
+ else:
383
+ tokens = jieba.lcut(text)
384
+ return tokens
385
+
386
+ def full2half(self, text):
387
+ """
388
+ 全角转化为半角
389
+ :param text:
390
+ :return:
391
+ """
392
+ ret_str = ""
393
+ for i in text:
394
+ if ord(i) >= 33 + 65248 and ord(i) <= 126 + 65248:
395
+ ret_str += chr(ord(i) - 65248)
396
+ else:
397
+ ret_str += i
398
+ return ret_str
399
+
400
+ def filter_html(self, text):
401
+ # 这个比较严格
402
+ """
403
+ 过滤html标签
404
+ :param text:
405
+ :return:
406
+ """
407
+ patterns = [
408
+ re.compile("//<![CDATA[[^>]*//]]>", re.I), # 匹配CDATA
409
+ re.compile("<s*script[^>]*>[^<]*<s*/s*scripts*>", re.I), # Script
410
+ re.compile("<s*style[^>]*>[^<]*<s*/s*styles*>", re.I), # style
411
+ re.compile("<brs*?/?>"), # 处理换行
412
+ re.compile("</?w+[^>]*>"), # HTML标签
413
+ re.compile("<!--[^>]*-->"), # HTML注释
414
+ ]
415
+ for pattern in patterns:
416
+ text = pattern.sub("", text)
417
+ return text
418
+
419
+ def filter_html_special(self, text):
420
+ """
421
+ 替换所有html转义字符
422
+ 这个好像只有新闻有?
423
+ :param text:
424
+ :return:
425
+ """
426
+ # TODO html标签应该是 &nbsp 这种,\xa0也是吗
427
+ CHAR_ENTITIES = {
428
+ "&nbsp": " ",
429
+ "160": " ",
430
+ "lt": "<",
431
+ "60": "<",
432
+ "gt": ">",
433
+ "62": ">",
434
+ "amp": "&",
435
+ "38": "&",
436
+ "quot": '"',
437
+ "34": '"',
438
+ "ldquo": '"',
439
+ "rdquo": '"',
440
+ "mdash": "",
441
+ "\xa0": "",
442
+ }
443
+
444
+ re_charEntity = re.compile(r"&#?(?P<name>\w+);", re.S)
445
+ sz = re.search(re_charEntity, text)
446
+ while sz:
447
+ entity = sz.group() # entity全称,如>
448
+ key = sz.group("name") # 去除&;后entity,如>为gt
449
+ try:
450
+ htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], text, 1)
451
+ text = htmlstr
452
+ sz = re.search(re_charEntity, htmlstr)
453
+ except KeyError:
454
+ # 以空串代替
455
+ htmlstr = re_charEntity.sub("", text, 1)
456
+ text = htmlstr
457
+ sz = re_charEntity.search(htmlstr)
458
+ return text
459
+
460
+ def filter_exclusive(self, text):
461
+ """
462
+ 去除 @、 #、 表情等twitter、微博“特有”的情况
463
+ :return:
464
+ """
465
+ pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
466
+ p = re.compile(pattern, re.S)
467
+ text = p.sub("", text)
468
+
469
+ dr = re.compile("@\w+", re.S)
470
+ text = dr.sub("", text)
471
+
472
+ return text
473
+
474
+ def filter_html_tag(self, text):
475
+ # res_tr = r'<a (.*?)></a>'
476
+ # m_tr = re.findall(res_tr,text,re.S|re.M)
477
+ res = re.sub(r"<a.*?>", "", text)
478
+ res = re.sub(r"</a>", "", res)
479
+ res = re.sub(r"<span.*?>", "", res)
480
+ res = re.sub(r"</span>", "", res)
481
+ res = re.sub(r"<img.*?>", "", res)
482
+ res = re.sub(r"<br.*?>", "", res)
483
+ res = re.sub(r"//", "", res)
484
+ res = re.sub(r"@", "", res)
485
+ res = re.sub(r"</", "", res)
486
+ # res = re.sub(r',', '', res)
487
+ # res = re.sub(r'&nbsp;', '', res)
488
+ return res
489
+
490
+ @staticmethod
491
+ def uniform_whitespace(
492
+ document,
493
+ whitespace=[
494
+ " ",
495
+ " ",
496
+ " ",
497
+ " ",
498
+ " ",
499
+ " ",
500
+ " ",
501
+ " ",
502
+ " ",
503
+ " ",
504
+ "",
505
+ "„",
506
+ ],
507
+ ):
508
+ # from https://github.com/bigscience-workshop/data-preparation
509
+ """There are different whitespace characters."""
510
+ whitespace = set(whitespace)
511
+ document = "".join(
512
+ [char if char not in whitespace else " " for char in document]
513
+ )
514
+ return document
515
+
516
+ def filter_pattern(self, text):
517
+ """
518
+ 返回True表示命中规则,需要过滤
519
+ """
520
+ for pattern in self.patterns_filter:
521
+ if re.match(pattern, text):
522
+ return True
523
+ return False
524
+
525
+ def replace_pattern(self, text):
526
+ for pattern, replace in self.patterns_replace:
527
+ text = re.sub(pattern, replace, text)
528
+ return text
529
+
530
+ def calc_proportion_zh(self,text):
531
+ text = text.strip()
532
+ # 如果是中国英文的情况,并且英文有空格分开
533
+ if " " in text:
534
+ pass
535
+ chinese_count = 0
536
+ for char in text:
537
+ if '\u4e00' <= char <= '\u9fff':
538
+ chinese_count += 1
539
+ else:
540
+ pass
541
class CopyFunc():
    """Helper functions copied from https://github.com/lemon234071/clean-dialog.

    Both helpers are static methods so they can be called on the class or on
    an instance.
    """

    @staticmethod
    def is_chinese_char(cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        return (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        )

    @staticmethod
    def contains_Chinese(seq):
        """Return True if any character of *seq* is a CJK character."""
        # Qualified lookup: a bare `is_chinese_char` is not visible from inside
        # a method body and raised NameError.
        return any(CopyFunc.is_chinese_char(ord(char)) for char in seq)
570
+
571
+
572
class EnTextProcess(object):
    """Placeholder for English-specific text processing; defines nothing yet."""
574
+
575
+
576
def convert2markdown(table: list) -> str:
    """Render *table* as a Markdown table.

    The first row of *table* supplies the column names and the remaining rows
    the data; the index column is omitted from the output.
    """
    body_rows = table[1:]
    header_row = table[0]
    return pd.DataFrame(body_rows, columns=header_row).to_markdown(index=False)
580
+
581
+
582
def convert_fullwidth2_basic(sentence):
    """Map full-width characters (code points 65281..65374) to basic ASCII.

    Each such character is shifted down by 65248; all other characters are
    kept as they are.  Reference: https://fuhaoku.net/U+FF21
    """
    return "".join(
        chr(ord(ch) - 65248) if 65281 <= ord(ch) <= 65374 else ch
        for ch in sentence
    )
590
+
591
+
592
def convert_basic2fullwidth(sentence):
    """Map printable ASCII characters (code points 33..126) to full-width.

    Each such character is shifted up by 65248; all other characters,
    including the plain space, are kept as they are.
    """
    return "".join(
        chr(ord(ch) + 65248) if 33 <= ord(ch) <= 126 else ch
        for ch in sentence
    )
599
+
600
if __name__ == "__main__":
    # Smoke run: build a processor from a few Pattern entries and push one
    # sample string through the standard pipeline.
    dp = TextProcess(
        patterns_filter=[
            Pattern.redundancy_space_pattern,
            Pattern.repeat_pattern,
            Pattern.special_char_pattern,
        ],
        patterns_replace=[(Pattern.special_char_pattern, " ")],
    )
    dp.process(text="demo")
@@ -0,0 +1,41 @@
1
+ neo4j_url: "******"
2
+
3
+ mysql:
4
+ host: "******"
5
+ port: "******"
6
+ user: "******"
7
+ password: "******"
8
+ database: "******"
9
+
10
+ es:
11
+ host:
12
+ - "******"
13
+ - "******"
14
+ - "******"
15
+ - "******"
16
+ timeout:
17
+ - "******"
18
+
19
+ mongo:
20
+ - uri: "******"
21
+ - db: "******"
22
+ - col: "******"
23
+ redis:
24
+ - uri: "******"
25
+
26
+ hbase:
27
+ - # 配置
28
+ - topic_num: "******"
29
+ - # 默认配置
30
+ - default_host: "******"
31
+ - default_port: "******"
32
+ - default_transport: "******"
33
+ - default_compat: "******"
34
+ - default_protocol: "******"
35
+
36
+ kafka:
37
+ bootstrap_server:
38
+ - "******"
39
+ - "******"
40
+ - "******"
41
+ topic: "******"
nlpertools/io/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji