nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +24 -20
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -55
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/data_client.py +387 -257
- nlpertools/data_structure/base_structure.py +109 -13
- nlpertools/dataprocess.py +611 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +54 -36
- nlpertools/io/file.py +277 -222
- nlpertools/ml.py +483 -460
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -65
- nlpertools/other.py +364 -249
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -43
- nlpertools/reminder.py +98 -87
- nlpertools/utils/__init__.py +3 -3
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -76
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -93
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -96
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
- nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
- nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
- nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.5.dist-info/METADATA +0 -85
- nlpertools-1.0.5.dist-info/RECORD +0 -25
- nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/dataprocess.py
CHANGED
@@ -1,3 +1,611 @@
-#!/usr/bin/python3.8
-# -*- coding: utf-8 -*-
-# @Author : youshu.Ji
+#!/usr/bin/python3.8
+# -*- coding: utf-8 -*-
+# @Author : youshu.Ji
+import re
+import string
+from typing import List
+
+import numpy as np
+
+# from . import DB_CONFIG_FILE  # cannot import name 'DB_CONFIG_FILE' from partially initialized module 'nlpertools'
+from .utils.package import *
+
+main_special_characters = string.punctuation + string.digits + string.whitespace
+other_special_characters = (
+    " ’“”–ー一▬…✦�£•€«»°·═"
+    "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
+    "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
+    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+    "」﴾》"
+)
+
+
+class Pattern:
+    """
+    >>> pattern_special_char = re.compile("[{}{}]".format(pattern_special_char_x[1:-1], pattern_special_char_u[1:-1]))
+    a = "\U000d8be6asdasdas \x00v啊实打实\x00\x00v阿松大\x00"
+    res = re.sub(pattern_special_char, "$", a)
+    """
+
+    # some from data-prepare
+
+    # emoji
+    """
+    # Another way to collect emoji; not sure whether the pattern below is complete
+    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
+    emoji = list(emoji.UNICODE_EMOJI["en"].keys())
+    """
+    emoji_pattern = "[\U00010000-\U0010ffff\\uD800-\\uDBFF\\uDC00-\\uDFFF]"
+
+    # special garbled or invisible characters
+    # \x 09:\t 0a:\n 0d:\r
+    special_char_x_pattern = "[\x00-\x08\x0b\x0c\x0e\x0f\x10-\x19\x1a-\x1f]"
+    # abnormal characters collected from large-scale corpora
+    special_char_u_pattern = (
+        "[\u3000\U000d8be6\U000e0062\U000e0063\U000e0067\U000e0073\U000e0074\U000e007f]"
+    )
+    special_char_pattern = "{}{}".format(
+        special_char_x_pattern[1:-1], special_char_u_pattern[1:-1]
+    )
+    non_printing_characters_pattern = (
+        f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]"
+    )
+
+    # these must match from the start of the string, otherwise they are meaningless
+    # Chinese person name
+    chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
+    # English person name
+    english_name_pattern = "(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
+    # pure digits
+    pure_num_pattern = "\d+"
+    # captions such as "xxxx图" (figure/table descriptions)
+    pic_table_descript_pattern = ".{1,15}图"
+
+    # these do not need to match from the start.
+    # hlink
+    hlink_pattern = (
+        r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
+    )
+    http_pattern = "(http|https):\/\/([\w.]+\/?)\S*/\S*"
+    # email
+    email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
+    # html, possibly too strict
+    html_pattern = "<[\s\S]*?>"
+    # repeats such as "asdasdasdasd"
+    repeat_pattern = "(.)\1+"
+    # date
+    day_time_pattern = "\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
+    # time of day
+    hour_time_pattern = "(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
+    # stock codes
+    stock_pattern = (
+        "(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
+    )
+
+    # these usually need to be replaced
+    # redundant spaces => " "
+    redundancy_space_pattern = " +"
+    # rarely needed: redundant line breaks => " "
+    linebreak_pattern = "[\r\n\t]+"
+
+    # Weibo video tags and similar
+    weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
+    # @
+    at_pattern = "@\w+"
+
+    # from https://github.com/bigscience-workshop/data-preparation pii
+    year_patterns = [
+        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])",
+        # yyyy-yyyy or yyyy/yyyy
+        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
+        # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
+        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
+        # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy
+        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
+        # mm-yyyy or mm/yyyy or the same but with yy
+        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
+        # yyyy-mm or yyyy/mm
+    ]
+
+    # Patterns for high-risk character strings
+    id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])'
+    # https://regex101.com/r/JQkmh8/2
+    # key_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[\s\p{Pd}]?){4,})(?:$|[\b\s\p{Han}@?,!;:\'\"])'
+    # https://regex101.com/r/JQkmh8/5
+    key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])'
+    ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
+    ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
+    ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
+        [ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
+
+    # https://regex101.com/r/EpA5B7/1
+    email_line_pattern = r'''
+        (?<= ^ | [\b\s@,?!;:)('".\p{Han}<] )
+        (
+            [^\b\s@?!;,:)('"<]+
+            @
+            [^\b\s@!?;,/]*
+            [^\b\s@?!;,/:)('">.]
+            \.
+            \p{L} \w{1,}
+        )
+        (?= $ | [\b\s@,?!;:)('".\p{Han}>] )
+    '''
+
+    # https://regex101.com/r/mOqi1s/3
+    # user_pattern = r'(?:^|[\s@,?!;:\'\")(\p{Han}])(@[^\s@,?!;:\'\")(]{3,})'
+    user_pattern = r'''
+        (?<= ^ | [)(\s@,?!;:'"\p{Han}] )
+        (@
+            [^)(\s@,?!;:'"]{3,}
+        )
+    '''
+
+
+class CalcPPL(object):
+    # perplexity (PPL) calculation
+    # https://www.scribendi.ai/comparing-bert-and-gpt-2-as-language-models-to-score-the-grammatical-correctness-of-a-sentence/
+    def __init__(self, model_type, model_path, tokenizer_path):
+        self.model_type = model_type
+        self.model, self.tokenizer = self._init_model(model_type, model_path, tokenizer_path)
+
+    @staticmethod
+    def _init_model(model_type, model_path, tokenizer_path):
+        if model_type == "ngram":
+            model = kenlm.Model(model_path)
+            tokenizer = sentencepiece.SentencePieceProcessor()
+            tokenizer.load(tokenizer_path)
+        elif model_type == "bert":
+            model = BertForMaskedLM.from_pretrained(model_path)
+            tokenizer = BertTokenizer.from_pretrained(model_path)
+        elif model_type == "gpt":
+            model = GPT2LMHeadModel.from_pretrained(model_path)
+            tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
+        else:
+            raise ValueError("model_type should be one of: ngram, bert, gpt")
+        return model, tokenizer
+
+    def ppl(self, sentence):
+        # dispatch automatically by model_type
+        if self.model_type == "ngram":
+            return self.ppl_ngram(sentence)
+        elif self.model_type == "bert":
+            return self.ppl_bert(sentence)
+        else:
+            return self.ppl3_gpt(sentence)
+
+    def ppl_ngram(self, sentence):
+        pass
+
+    def ppl_bert_2(self, sentence):
+        # source of this snippet forgotten
+        tokenizer = self.tokenizer
+        model = self.model
+        tokenize_input = tokenizer.tokenize(sentence)
+        tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
+        with torch.no_grad():
+            loss = model(tensor_input, labels=tensor_input)[0]
+        return np.exp(loss.detach().numpy())
+
+    # [1] Salazar J, Liang D, Nguyen T Q, et al. Masked Language Model Scoring[C]//Proceedings of ACL. 2020: 2699-2712.
+    def ppl_bert(self, sentence):
+        tokenizer = self.tokenizer
+        model = self.model
+        with torch.no_grad():
+            tokenize_input = tokenizer.tokenize(sentence)
+            tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
+            sen_len = len(tokenize_input)
+            sentence_loss = 0.
+
+            for i, word in enumerate(tokenize_input):
+                # add mask to i-th character of the sentence
+                tokenize_input[i] = '[MASK]'
+                mask_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
+
+                output = model(mask_input)
+
+                prediction_scores = output[0]
+                softmax = nn.Softmax(dim=0)
+                ps = softmax(prediction_scores[0, i]).log()
+                word_loss = ps[tensor_input[0, i]]
+                sentence_loss += word_loss.item()
+
+                tokenize_input[i] = word
+            ppl = np.exp(-sentence_loss / sen_len)
+            # print("perplexity:", ppl)
+            return ppl
+
+    def ppl3_gpt(self, text):
+        from torch.nn import CrossEntropyLoss
+        # GPT2LMHeadModel is used here
+        inputs = self.tokenizer([text], padding='max_length', max_length=50, truncation=True, return_tensors="pt")
+        bs, sl = inputs['input_ids'].size()
+        outputs = self.model(**inputs, labels=inputs['input_ids'])
+        logits = outputs[1]
+        # Shift so that tokens < n predict n
+        shift_logits = logits[:, :-1, :].contiguous()
+        shift_labels = inputs['input_ids'][:, 1:].contiguous()
+        shift_attentions = inputs['attention_mask'][:, 1:].contiguous()
+        # Flatten the tokens
+        loss_fct = CrossEntropyLoss(ignore_index=0, reduction="none")
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).detach().reshape(bs, -1)
+        meanloss = loss.sum(1) / shift_attentions.sum(1)
+        ppl = torch.exp(meanloss).numpy().tolist()
+        return ppl[0]
+
+    def test(self):
+        sentence = "输入句子:"
+        ppl = self.ppl_bert_2(sentence)
+        ppl2 = self.ppl_bert(sentence)
+        print(ppl)
+        print(ppl2)
+
+
+class TextProcess(object):
+    """
+    Text processing class.
+    This is the base class; subclass it for customized, language-specific processing.
+    """
+
+    def __init__(
+            self,
+            patterns_filter: List = None,
+            patterns_replace: List[List] = None,
+            words_filter: List = []
+    ):
+        """
+        pattern_list:
+        """
+        self.patterns_filter, self.patterns_replace = self._pre_compile_pattern(
+            patterns_filter, patterns_replace
+        )
+        self.words_filter = words_filter
+
+    @staticmethod
+    def _pre_compile_pattern(patterns_filter, patterns_replace):
+        complied_patterns_replace, complied_patterns_filter = [], []
+        for i in patterns_filter:
+            complied_patterns_filter.append(re.compile(i))
+        for i in patterns_replace:
+            complied_patterns_replace.append((re.compile(i[0]), i[1]))
+        return complied_patterns_filter, complied_patterns_replace
+
+    def process(self, text):
+        # normalization applied to every incoming text
+        text = self.full2half(text)
+        # text = self.filter_http(text)
+        text = self.filter_html(text)
+        text = self.filter_html_special(text)
+        # handled separately according to data type and language
+        text = self.filter_exclusive(text)
+        # text = self.trandition2simple(text)
+        # text = self.remove_stopwords(text)
+        return text
+
+    def filter_words(self, text):
+        # dictionary lookup: return True when a word is hit, i.e. the text should be filtered out
+
+        for word in self.words_filter:
+            if word in text:
+                return True
+        return False
+
+    def filter_whitelist(self, text):
+        whitelist = re.compile(
+            "[^\u4e00-\u9fa5^0-9a-zA-Z^-^《^》^<^>^【^】^(^)^{^}^–^…^”^“^,^.^;^?^:^‘^~^`^,^。^?^;^!^:^、^·^!^@^#^$^%^&^(^)^|]"
+        )
+        text = whitelist.sub("", text)
+        return text
+
+    def text_split(self, text, language):
+        if language == "en":
+            text = text[:256]
+        elif language == "zh":
+            text = text[:510]
+        return text
+
+    def trandition2simple(self, text):
+        # Chinese only: convert traditional characters to simplified
+        """
+        https://juejin.cn/post/7234554420163100728
+        """
+        text = zhconv.convert(text, "zh-cn")
+        return text
+
+    def remove_stopwords(self, text):
+        import jieba
+
+        new_tokens = []
+        if self.language == "en":
+            tokens = text.split(" ")
+        else:
+            tokens = jieba.lcut(text)
+
+        for i in tokens:
+            if i in self.stopwords:
+                pass
+            else:
+                new_tokens.append(i)
+
+        return new_tokens
+
+    @staticmethod
+    def split_sentence(sentence, language="chinese"):
+        """
+        Split text into sentences. English has nltk; Chinese deserves a decent splitter too.
+        :param sentence:
+        :param language:
+        :return:
+        """
+        # sentences->Str
+        # example '12“345。”“6789”'
+        assert language in ["chinese", "english"], "only chinese and english are supported"
+        if language == "chinese":
+            split_signs = list("。!?…\t")
+            other_sign = "”"
+        elif language == "english":
+            split_signs = list(".!?")
+            other_sign = '"'
+        else:
+            split_signs = list(".!?")
+            other_sign = '"'
+        sentences = []
+        start_idx = 0
+        for idx, char in enumerate(sentence):
+            if idx == len(sentence) - 1:
+                if char in split_signs:
+                    sentences.append(sentence[start_idx: idx + 1].strip())
+                    start_idx = idx + 1
+                else:
+                    sentences.append(sentence[start_idx:].strip())
+            else:
+                if char in split_signs:
+                    if sentence[idx + 1] == other_sign:
+                        if idx < len(sentence) - 2:
+                            # handle cases like 。”。
+                            if sentence[idx + 2] not in split_signs:
+                                sentences.append(sentence[start_idx: idx + 2].strip())
+                                start_idx = idx + 2
+                    elif sentence[idx + 1] not in split_signs:
+                        sentences.append(sentence[start_idx: idx + 1].strip())
+                        start_idx = idx + 1
+        sentences = [i.strip() for i in sentences if i.strip()]
+        return sentences
+
+    def cut_word(self, text, language):
+        import jieba
+
+        if language == "en":
+            tokens = text.split(" ")
+        else:
+            tokens = jieba.lcut(text)
+        return tokens
+
+    def full2half(self, text):
+        """
+        Convert full-width characters to half-width.
+        :param text:
+        :return:
+        """
+        ret_str = ""
+        for i in text:
+            if ord(i) >= 33 + 65248 and ord(i) <= 126 + 65248:
+                ret_str += chr(ord(i) - 65248)
+            else:
+                ret_str += i
+        return ret_str
+
+    def filter_html(self, text):
+        # this one is fairly strict
+        """
+        Filter out HTML tags.
+        :param text:
+        :return:
+        """
+        patterns = [
+            re.compile("//<![CDATA[[^>]*//]]>", re.I),  # CDATA
+            re.compile("<s*script[^>]*>[^<]*<s*/s*scripts*>", re.I),  # script
+            re.compile("<s*style[^>]*>[^<]*<s*/s*styles*>", re.I),  # style
+            re.compile("<brs*?/?>"),  # line breaks
+            re.compile("</?w+[^>]*>"),  # HTML tags
+            re.compile("<!--[^>]*-->"),  # HTML comments
+        ]
+        for pattern in patterns:
+            text = pattern.sub("", text)
+        return text
+
+    def filter_html_special(self, text):
+        """
+        Replace all HTML escape entities.
+        This seems to appear only in news data?
+        :param text:
+        :return:
+        """
+        # TODO: HTML escapes should look like &nbsp;, does \xa0 count as well?
+        CHAR_ENTITIES = {
+            "nbsp": " ",
+            "160": " ",
+            "lt": "<",
+            "60": "<",
+            "gt": ">",
+            "62": ">",
+            "amp": "&",
+            "38": "&",
+            "quot": '"',
+            "34": '"',
+            "ldquo": '"',
+            "rdquo": '"',
+            "mdash": "",
+            "\xa0": "",
+        }
+
+        re_charEntity = re.compile(r"&#?(?P<name>\w+);", re.S)
+        sz = re.search(re_charEntity, text)
+        while sz:
+            entity = sz.group()  # the full entity, e.g. &gt;
+            key = sz.group("name")  # the entity without & and ;, e.g. gt for &gt;
+            try:
+                htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], text, 1)
+                text = htmlstr
+                sz = re.search(re_charEntity, htmlstr)
+            except KeyError:
+                # replace with an empty string
+                htmlstr = re_charEntity.sub("", text, 1)
+                text = htmlstr
+                sz = re_charEntity.search(htmlstr)
+        return text
+
+    def filter_exclusive(self, text):
+        """
+        Remove @, #, emoticons and other Twitter/Weibo-specific artifacts.
+        :return:
+        """
+        pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
+        p = re.compile(pattern, re.S)
+        text = p.sub("", text)
+
+        dr = re.compile("@\w+", re.S)
+        text = dr.sub("", text)
+
+        return text
+
+    def filter_html_tag(self, text):
+        # res_tr = r'<a (.*?)></a>'
+        # m_tr = re.findall(res_tr,text,re.S|re.M)
+        res = re.sub(r"<a.*?>", "", text)
+        res = re.sub(r"</a>", "", res)
+        res = re.sub(r"<span.*?>", "", res)
+        res = re.sub(r"</span>", "", res)
+        res = re.sub(r"<img.*?>", "", res)
+        res = re.sub(r"<br.*?>", "", res)
+        res = re.sub(r"//", "", res)
+        res = re.sub(r"@", "", res)
+        res = re.sub(r"</", "", res)
+        # res = re.sub(r',', '', res)
+        # res = re.sub(r' ', '', res)
+        return res
+
+    @staticmethod
+    def uniform_whitespace(
+            document,
+            whitespace=[
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                "",
+                "",
+            ],
+    ):
+        # from https://github.com/bigscience-workshop/data-preparation
+        """There are different whitespace characters."""
+        whitespace = set(whitespace)
+        document = "".join(
+            [char if char not in whitespace else " " for char in document]
+        )
+        return document
+
+    def filter_pattern(self, text):
+        """
+        Return True when a filter pattern is hit, i.e. the text should be dropped.
+        """
+        for pattern in self.patterns_filter:
+            if re.match(pattern, text):
+                return True
+        return False
+
+    def replace_pattern(self, text):
+        for pattern, replace in self.patterns_replace:
+            text = re.sub(pattern, replace, text)
+        return text
+
+    def calc_proportion_zh(self, text):
+        text = text.strip()
+        # mixed Chinese/English text where the English words are space-separated
+        if " " in text:
+            pass
+        chinese_count = 0
+        for char in text:
+            if '\u4e00' <= char <= '\u9fff':
+                chinese_count += 1
+            else:
+                pass
+
+
+class CopyFunc():
+    # from https://github.com/lemon234071/clean-dialog
+    def is_chinese_char(cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        return (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        )
+
+    def contains_Chinese(seq):
+        for char in seq:
+            cp = ord(char)
+            if is_chinese_char(cp):
+                return True
+        return False
+
+
+class EnTextProcess(object):
+    pass
+
+
+def convert2markdown(table: list) -> str:
+    df = pd.DataFrame(table[1:], columns=table[0])
+
+    return df.to_markdown(index=False)
+
+
+def convert_fullwidth2_basic(sentence):
+    # reference: https://fuhaoku.net/U+FF21
+    new_sentence = ""
+    for char in sentence:
+        if 65281 <= ord(char) <= 65374:
+            char = chr(ord(char) - 65248)
+        new_sentence += char
+    return new_sentence
+
+
+def convert_basic2fullwidth(sentence):
+    new_sentence = ""
+    for char in sentence:
+        if 33 <= ord(char) <= 126:
+            char = chr(ord(char) + 65248)
+        new_sentence += char
+    return new_sentence
+
+
+if __name__ == "__main__":
+    pattern_for_filter = [
+        Pattern.redundancy_space_pattern,
+        Pattern.repeat_pattern,
+        Pattern.special_char_pattern,
+    ]
+    pattern_for_replace = [(Pattern.special_char_pattern, " ")]
+
+    dp = TextProcess(
+        patterns_filter=pattern_for_filter, patterns_replace=pattern_for_replace
+    )
+    dp.process(text="demo")
nlpertools/default_db_config.yml
ADDED
@@ -0,0 +1,41 @@
+neo4j_url: "******"
+
+mysql:
+  host: "******"
+  port: "******"
+  user: "******"
+  password: "******"
+  database: "******"
+
+es:
+  host:
+    - "******"
+    - "******"
+    - "******"
+    - "******"
+  timeout:
+    - "******"
+
+mongo:
+  - uri: "******"
+  - db: "******"
+  - col: "******"
+redis:
+  - uri: "******"
+
+hbase:
+  - # config
+  - topic_num: "******"
+  - # default config
+  - default_host: "******"
+  - default_port: "******"
+  - default_transport: "******"
+  - default_compat: "******"
+  - default_protocol: "******"
+
+kafka:
+  bootstrap_server:
+    - "******"
+    - "******"
+    - "******"
+  topic: "******"
nlpertools/io/__init__.py
CHANGED
@@ -1,3 +1,3 @@
-#!/usr/bin/python3.8
-# -*- coding: utf-8 -*-
-# @Author : youshu.Ji
+#!/usr/bin/python3.8
+# -*- coding: utf-8 -*-
+# @Author : youshu.Ji