pyxllib 0.3.197__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
- pyxllib/__init__.py +21 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +541 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +149 -149
- pyxllib/algo/unitlib.py +66 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +852 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +240 -240
- pyxllib/data/jsonlib.py +89 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1127 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +246 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/old.py +663 -663
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +497 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +827 -827
- pyxllib/ext/utools.py +351 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +88 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1105 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +761 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +148 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +426 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +685 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2829 -2829
- pyxllib/file/xlsxlib.py +3131 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +64 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1197 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +391 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +108 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +39 -39
- pyxllib/text/airscript.js +744 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +32 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +747 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/METADATA +1 -1
- pyxllib-0.3.200.dist-info/RECORD +126 -0
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/licenses/LICENSE +190 -190
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +0 -0
pyxllib/text/jiebalib.py
CHANGED
@@ -1,267 +1,267 @@

Every one of the 267 lines was removed and re-added; the two sides of the diff are identical as displayed (likely only line endings or file metadata changed), so the file is shown once below.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 陈坤泽
# @Email : 877362867@qq.com
# @Date : 2023/11/05

""" Some text-processing utilities based on the jieba library """

from collections import Counter
import re

from tqdm import tqdm
import pandas as pd

import jieba
import jieba.posseg as pseg
from simhash import Simhash

from pyxllib.prog.pupil import DictTool, run_once
from pyxllib.file.specialist import XlPath
from pyxllib.algo.stat import update_dataframes_to_excel


def jieba_add_words(words):
    for w in words:
        jieba.add_word(w)


def jieba_del_words(words):
    for w in words:
        jieba.del_word(w)


@run_once('str')
def jieba_cut(text):
    return tuple(jieba.cut(text))


@run_once('str')
def pseg_cut(text):
    return tuple(pseg.cut(text))
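The `run_once('str')` decorator comes from pyxllib.prog.pupil; judging from its use here, it appears to memoize results keyed on the string form of the arguments, so repeated cuts of the same text are segmented only once. A minimal stand-alone sketch of that caching idea (my own illustration, not pyxllib's implementation):

```python
from functools import wraps

def run_once_by_str(func):
    """Memoize func by str(args) -- a sketch of what run_once('str') appears to do."""
    cache = {}
    @wraps(func)
    def wrapper(*args):
        key = str(args)
        if key not in cache:
            cache[key] = func(*args)
        return cache[key]
    return wrapper

@run_once_by_str
def cut_cached(text):
    # stand-in for jieba.cut; returns a tuple so the cached value is immutable
    return tuple(text.split())
```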
def _count_word_frequency(texts, function_word=True):
    """ Count keyword frequencies (mainly a helper for computing tf-idf)

    :param texts: list of input strings
    :param function_word: whether to count function words
    :return: a dict
        key: the segmented word
        value: [x, y], where x is the total frequency and y is the number of texts the word appears in

    >>> _count_word_frequency(['正正正正', '正反正', '反反反反'])
    {'正正': [1, 1], '反反': [2, 1]}

    Result without the part-of-speech filter: {'正正': [2, 1], '正': [1, 1], '反正': [1, 1], '反反': [2, 1]}
    """

    d = dict()
    for text in tqdm(texts, '词频统计'):
        wordflags = list(pseg.cut(text))
        words = set()
        for word, flag in wordflags:
            # function words are not recorded
            if (not function_word) and flag in ('uj', 'd', 'p', 'c', 'u', 'xc'):
                continue
            words.add(word)
            if word not in d:
                d[word] = [0, 0]
            d[word][0] += 1
        for word in words:
            d[word][1] += 1
    return d


def analyse_tf_idf(texts, outfile=None, sheet_name='tf-idf', *, function_word=True):
    """ Analyse tf-idf values

    :param list[str] texts: the text content of several documents
    :return: a DataFrame

    jieba may ship a built-in for this, but it is simple enough to write by hand.
    Note that the tf-idf returned here is scaled up by the total frequency, so the displayed values are larger and easier to read.
    """
    from math import log10

    frequency = _count_word_frequency(texts, function_word)
    DictTool.isub(frequency, [' ', '\t', '\n'])

    n = len(texts)
    sum_frequency = sum([v[0] for v in frequency.values()])

    li = []
    for k, v in frequency.items():
        idf = log10(n / v[1])
        # idf = 1
        li.append([k, v[0], v[0] / sum_frequency, v[1], idf, v[0] * idf])
    df = pd.DataFrame.from_records(li, columns=('词汇', '频数', '频率', '出现该词文章数', 'idf', 'tf-idf'))
    df.sort_values(by='tf-idf', ascending=False, inplace=True)

    if outfile:
        update_dataframes_to_excel(outfile, {sheet_name: df})

    return df
class TextClassifier:
    def __init__(self, texts=None):
        """ Text classifier

        :param list[str] texts: text contents
        """

        self.texts = []
        self.tfidf = {}
        self.vecs = []  # the vectorized representation of each text
        self.default_tfidf = 1  # if tf-idf has not been computed, fall back to a uniform weight of 1

        if texts:
            for text in texts:
                self.texts.append(text)

    def get_text_tf(self, text, *,
                    function_word_weight=0.2,
                    normalize=True,
                    ingore_words=(' ', '\t', '\n'),
                    add_flag=False):
        """ The algorithm for extracting a text's keywords can be customized here

        :param function_word_weight: weight applied to function words; usually a small fraction to down-weight them

        Typical customizations are filtering rules, e.g. dropping certain parts of speech or specific words.
        """
        ct = Counter()

        # 1 Initial segmentation, optionally down-weighting function words
        wordflags = list(pseg_cut(text))
        for word, flag in wordflags:
            if flag in ('uj', 'd', 'p', 'c', 'u', 'xc', 'x'):
                if add_flag:
                    ct[word + ',' + flag] += function_word_weight
                else:
                    ct[word] += function_word_weight
            else:
                if add_flag:
                    ct[word + ',' + flag] += 1
                else:
                    ct[word] += 1

        # 2 Normalize certain words
        if normalize:
            ct2 = Counter()
            for k, v in ct.items():
                # other word normalizations can also be configured here
                k = re.sub(r'\d', '0', k)  # replace every digit with 0
                ct2[k] += v
            ct = ct2

        # 3 Filter out some words
        if ingore_words:
            for k in ingore_words:
                if k in ct:
                    del ct[k]

        return ct

    def compute_tfidf(self, outfile=None, sheet_name='tf-idf', normalize=False, function_word_weight=0.2,
                      add_flag=False):
        """ Recompute the tf-idf table """
        from math import log10

        # 1 Count frequencies and the number of texts containing each word
        d = dict()
        for text in tqdm(self.texts, '词频统计'):
            ct = self.get_text_tf(text, normalize=normalize, function_word_weight=function_word_weight,
                                  add_flag=add_flag)
            for k, v in ct.items():
                if k not in d:
                    d[k] = [0, 0]
                d[k] = [d[k][0] + v, d[k][1] + 1]

        # 2 Compute tf-idf
        n = len(self.texts)
        sum_tf = sum([v[0] for v in d.values()])
        ls = []
        for k, v in d.items():
            idf = log10(n / v[1])
            # idf = 1
            ls.append([k, v[0], v[0] / sum_tf, v[1], idf, v[0] * idf])

        df = pd.DataFrame.from_records(ls, columns=('词汇', '频数', '频率', '出现该词文章数', 'idf', 'tf-idf'))
        df.sort_values(by='tf-idf', ascending=False, inplace=True)

        # 3 Save to file
        if outfile:
            update_dataframes_to_excel(outfile, {sheet_name: df})

        self.tfidf = {row['词汇']: row['tf-idf'] for idx, row in df.iterrows()}
        self.default_tfidf = df.loc[len(df) - 1]['tf-idf']  # the last row's weight becomes the default for unseen words

        return df

    def normalization(self, d):
        """ Vector normalization

        Takes a vector represented as a dict-like structure and scales it to unit length
        """
        length = sum([v * v for v in d.values()]) ** 0.5  # vector length
        return {k: v / length for k, v in d.items()}

    def get_text_vec(self, text):
        """ Get the vectorized representation of a text

        :param str text: text content
        """
        ct = self.get_text_tf(text)
        vec = {k: v * self.tfidf.get(k, self.default_tfidf) for k, v in ct.items()}
        vec = self.normalization(vec)
        return vec

    def compute_vecs(self):
        """ Recompute the vectorized representations """
        vecs = []
        for text in tqdm(self.texts, desc='query向量化'):
            vecs.append(self.get_text_vec(text))
        self.vecs = vecs
        return vecs

    def cosine_similar(self, x, y):
        """ Cosine similarity of two vectors; larger means more similar

        This is the simplified version that only takes the dot product, so make sure both inputs are unit-length vectors.
        Note that x and y are sparse vectors stored as dicts.
        """
        keys = x.keys() & y.keys()  # the keys shared by x and y
        return sum([x[k] * y[k] for k in keys])

    def find_similar_vec(self, x, maxn=10):
        """ Find the vectors most similar to x; returns indices and similarities

        :param x: the query object
        :param maxn: return the top maxn most similar objects
        """
        if isinstance(x, str):
            x = self.get_text_vec(x)

        # todo parallelize? A dense vectorization would also work, but as a sparse matrix it takes a lot of space
        sims = [(i, self.cosine_similar(x, v)) for i, v in enumerate(self.vecs)]
        sims.sort(key=lambda x: x[1], reverse=True)
        return sims[:maxn]

    def refine_vecs(self):
        """ Optimize the vector data by dropping dimensions whose weight is below 0.0001 """
        # 1 Sort each vector's entries by weight and keep only the significant ones
        vecs = []
        for vec in tqdm(self.vecs, '优化向量'):
            vec = [(k, v) for k, v in vec.items()]
            vec.sort(key=lambda x: x[1], reverse=True)
            vec2 = {}
            for k, v in vec:
                if v < 0.0001:
                    break
                vec2[k] = round(v, 4)
            vecs.append(vec2)

        self.vecs = vecs
        return self.vecs
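A minimal usage sketch for `TextClassifier` (the sample texts are made-up placeholders):

```python
from pyxllib.text.jiebalib import TextClassifier

texts = ['今天天气真好', '今天天气不错', '股市全线下跌']
tc = TextClassifier(texts)
tc.compute_tfidf()    # build the tf-idf weight table from the corpus
tc.compute_vecs()     # one unit-length sparse dict-vector per text
tc.refine_vecs()      # drop dimensions with weight < 0.0001

# Query with raw text: returns [(index, cosine similarity), ...], best match first
print(tc.find_similar_vec('今天天气怎么样', maxn=2))
```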
pyxllib/text/jinjalib.py
CHANGED
@@ -1,32 +1,32 @@

As with jiebalib.py, all 32 lines were removed and re-added with identical text, so the file is shown once below.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 陈坤泽
# @Email : 877362867@qq.com
# @Date : 2024/05/26

from pyxllib.prog.pupil import check_install_package

# ensure the jinja2 templating library is installed
check_install_package('jinja2')

import jinja2
from jinja2 import Template, Environment

from pyxllib.file.specialist import XlPath


def set_template(s, *args, **kwargs):
    """ todo: is this name too prone to collisions? """
    return Template(s.strip(), *args, **kwargs)


def set_meta_template(s, meta_start='[[', meta_end=']]', **kwargs):
    """ Supports pre-rendering with an alternate delimiter format, then returns a standard template for final rendering """
    t = Template(s.strip(), variable_start_string=meta_start,
                 variable_end_string=meta_end).render(**kwargs)
    return Template(t)


def get_jinja_template(name, **kwargs):
    template = Environment(**kwargs).from_string((XlPath(__file__).parent / f'templates/{name}').read_text())
    return template
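A small sketch of the two-stage rendering `set_meta_template` enables: the `[[ ]]` placeholders are filled first, and the result comes back as an ordinary Jinja2 template whose `{{ }}` placeholders are rendered later (the template string here is illustrative):

```python
from pyxllib.text.jinjalib import set_meta_template

t = set_meta_template('Hello {{ name }}, built with pyxllib [[ version ]].',
                      version='0.3.200')
print(t.render(name='world'))
# -> Hello world, built with pyxllib 0.3.200.
```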