pyxllib-0.3.197-py3-none-any.whl → pyxllib-3.201.1-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (127)
  1. pyxllib/__init__.py +14 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +537 -541
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -389
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -629
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -241
  11. pyxllib/algo/stat.py +494 -494
  12. pyxllib/algo/treelib.py +145 -149
  13. pyxllib/algo/unitlib.py +62 -66
  14. pyxllib/autogui/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -246
  16. pyxllib/autogui/all.py +9 -9
  17. pyxllib/autogui/autogui.py +846 -852
  18. pyxllib/autogui/uiautolib.py +362 -362
  19. pyxllib/autogui/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -827
  21. pyxllib/autogui/wechat_msg.py +421 -421
  22. pyxllib/autogui/wxautolib.py +84 -84
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -137
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +236 -240
  34. pyxllib/data/jsonlib.py +85 -89
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1111 -1127
  37. pyxllib/data/sqlite.py +568 -568
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -505
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +251 -246
  42. pyxllib/ext/drissionlib.py +277 -277
  43. pyxllib/ext/kq5034lib.py +12 -12
  44. pyxllib/ext/qt.py +449 -449
  45. pyxllib/ext/robustprocfile.py +493 -497
  46. pyxllib/ext/seleniumlib.py +76 -76
  47. pyxllib/ext/tk.py +173 -173
  48. pyxllib/ext/unixlib.py +821 -827
  49. pyxllib/ext/utools.py +345 -351
  50. pyxllib/ext/webhook.py +124 -119
  51. pyxllib/ext/win32lib.py +40 -40
  52. pyxllib/ext/wjxlib.py +91 -88
  53. pyxllib/ext/wpsapi.py +124 -124
  54. pyxllib/ext/xlwork.py +9 -9
  55. pyxllib/ext/yuquelib.py +1110 -1105
  56. pyxllib/file/__init__.py +17 -17
  57. pyxllib/file/docxlib.py +757 -761
  58. pyxllib/file/gitlib.py +309 -309
  59. pyxllib/file/libreoffice.py +165 -165
  60. pyxllib/file/movielib.py +144 -148
  61. pyxllib/file/newbie.py +10 -10
  62. pyxllib/file/onenotelib.py +1469 -1469
  63. pyxllib/file/packlib/__init__.py +330 -330
  64. pyxllib/file/packlib/zipfile.py +2441 -2441
  65. pyxllib/file/pdflib.py +422 -426
  66. pyxllib/file/pupil.py +185 -185
  67. pyxllib/file/specialist/__init__.py +681 -685
  68. pyxllib/file/specialist/dirlib.py +799 -799
  69. pyxllib/file/specialist/download.py +193 -193
  70. pyxllib/file/specialist/filelib.py +2825 -2829
  71. pyxllib/file/xlsxlib.py +3122 -3131
  72. pyxllib/file/xlsyncfile.py +341 -341
  73. pyxllib/prog/__init__.py +5 -5
  74. pyxllib/prog/cachetools.py +58 -64
  75. pyxllib/prog/deprecatedlib.py +233 -233
  76. pyxllib/prog/filelock.py +42 -42
  77. pyxllib/prog/ipyexec.py +253 -253
  78. pyxllib/prog/multiprogs.py +940 -940
  79. pyxllib/prog/newbie.py +451 -451
  80. pyxllib/prog/pupil.py +1208 -1197
  81. pyxllib/prog/sitepackages.py +33 -33
  82. pyxllib/prog/specialist/__init__.py +348 -391
  83. pyxllib/prog/specialist/bc.py +203 -203
  84. pyxllib/prog/specialist/browser.py +497 -497
  85. pyxllib/prog/specialist/common.py +347 -347
  86. pyxllib/prog/specialist/datetime.py +198 -198
  87. pyxllib/prog/specialist/tictoc.py +240 -240
  88. pyxllib/prog/specialist/xllog.py +180 -180
  89. pyxllib/prog/xlosenv.py +110 -108
  90. pyxllib/stdlib/__init__.py +17 -17
  91. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  92. pyxllib/stdlib/tablepyxl/style.py +303 -303
  93. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  94. pyxllib/text/__init__.py +8 -8
  95. pyxllib/text/ahocorasick.py +36 -39
  96. pyxllib/text/airscript.js +754 -744
  97. pyxllib/text/charclasslib.py +121 -121
  98. pyxllib/text/jiebalib.py +267 -267
  99. pyxllib/text/jinjalib.py +27 -32
  100. pyxllib/text/jsa_ai_prompt.md +271 -271
  101. pyxllib/text/jscode.py +922 -922
  102. pyxllib/text/latex/__init__.py +158 -158
  103. pyxllib/text/levenshtein.py +303 -303
  104. pyxllib/text/nestenv.py +1215 -1215
  105. pyxllib/text/newbie.py +300 -300
  106. pyxllib/text/pupil/__init__.py +8 -8
  107. pyxllib/text/pupil/common.py +1121 -1121
  108. pyxllib/text/pupil/xlalign.py +326 -326
  109. pyxllib/text/pycode.py +47 -47
  110. pyxllib/text/specialist/__init__.py +8 -8
  111. pyxllib/text/specialist/common.py +112 -112
  112. pyxllib/text/specialist/ptag.py +186 -186
  113. pyxllib/text/spellchecker.py +172 -172
  114. pyxllib/text/templates/echart_base.html +10 -10
  115. pyxllib/text/templates/highlight_code.html +16 -16
  116. pyxllib/text/templates/latex_editor.html +102 -102
  117. pyxllib/text/vbacode.py +17 -17
  118. pyxllib/text/xmllib.py +741 -747
  119. pyxllib/xl.py +42 -39
  120. pyxllib/xlcv.py +17 -17
  121. pyxllib-3.201.1.dist-info/METADATA +296 -0
  122. pyxllib-3.201.1.dist-info/RECORD +125 -0
  123. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/licenses/LICENSE +190 -190
  124. pyxllib/ext/old.py +0 -663
  125. pyxllib-0.3.197.dist-info/METADATA +0 -48
  126. pyxllib-0.3.197.dist-info/RECORD +0 -126
  127. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/WHEEL +0 -0
pyxllib/text/jiebalib.py CHANGED
@@ -1,267 +1,267 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Author : 陈坤泽
- # @Email : 877362867@qq.com
- # @Date : 2023/11/05
-
- """ Text-processing utilities built on the jieba library """
-
- from collections import Counter
- import re
-
- from tqdm import tqdm
- import pandas as pd
-
- import jieba
- import jieba.posseg as pseg
- from simhash import Simhash
-
- from pyxllib.prog.pupil import DictTool, run_once
- from pyxllib.file.specialist import XlPath
- from pyxllib.algo.stat import update_dataframes_to_excel
-
-
- def jieba_add_words(words):
-     for w in words:
-         jieba.add_word(w)
-
-
- def jieba_del_words(words):
-     for w in words:
-         jieba.del_word(w)
-
-
- @run_once('str')
- def jieba_cut(text):
-     return tuple(jieba.cut(text))
-
-
- @run_once('str')
- def pseg_cut(text):
-     return tuple(pseg.cut(text))
-
-
- def _count_word_frequency(texts, function_word=True):
-     """ Count keyword frequencies (mainly a helper for computing tf-idf)
-
-     :param texts: list of input strings
-     :param function_word: whether to count function words
-     :return: a dict
-         key: the segmented word
-         values: [x, y], where x is the total frequency and y is the number of documents the word appears in
-
-     >>> _count_word_frequency(['正正正正', '正反正', '反反反反'])
-     {'正正': [1, 1], '反反': [2, 1]}
-
-     Result without part-of-speech filtering: {'正正': [2, 1], '正': [1, 1], '反正': [1, 1], '反反': [2, 1]}
-     """
-
-     d = dict()
-     for text in tqdm(texts, '词频统计'):
-         wordflags = list(pseg.cut(text))
-         words = set()
-         for word, flag in wordflags:
-             # function words are not recorded
-             if (not function_word) and flag in ('uj', 'd', 'p', 'c', 'u', 'xc'):
-                 continue
-             words.add(word)
-             if word not in d:
-                 d[word] = [0, 0]
-             d[word][0] += 1
-         for word in words:
-             d[word][1] += 1
-     return d
-
-
- def analyse_tf_idf(texts, outfile=None, sheet_name='tf-idf', *, function_word=True):
-     """ Analyse tf-idf values
-
-     :param list[str] texts: text contents of multiple documents
-     :return: a DataFrame
-
-     jieba may already ship helpers for this, but it is easy enough to implement by hand
-     note that the tf-idf values returned here are scaled up by the total frequency, so the displayed numbers are larger and easier to read
-     """
-     from math import log10
-
-     frequency = _count_word_frequency(texts, function_word)
-     DictTool.isub(frequency, [' ', '\t', '\n'])
-
-     n = len(texts)
-     sum_frequency = sum([v[0] for v in frequency.values()])
-
-     li = []
-     for k, v in frequency.items():
-         idf = log10(n / v[1])
-         # idf = 1
-         li.append([k, v[0], v[0] / sum_frequency, v[1], idf, v[0] * idf])
-     df = pd.DataFrame.from_records(li, columns=('词汇', '频数', '频率', '出现该词文章数', 'idf', 'tf-idf'))
-     df.sort_values(by='tf-idf', ascending=False, inplace=True)
-
-     if outfile:
-         update_dataframes_to_excel(outfile, {sheet_name: df})
-
-     return df
-
-
- class TextClassifier:
-     def __init__(self, texts=None):
-         """ Text classifier
-
-         :param list[str] texts: text contents
-         """
-
-         self.texts = []
-         self.tfidf = {}
-         self.vecs = []  # vectorized representation of each text
-         self.default_tfidf = 1  # if tf-idf has not been computed, every weight defaults to 1
-
-         if texts:
-             for text in texts:
-                 self.texts.append(text)
-
-     def get_text_tf(self, text, *,
-                     function_word_weight=0.2,
-                     normalize=True,
-                     ingore_words=(' ', '\t', '\n'),
-                     add_flag=False):
-         """ The algorithm for extracting keywords from text can be customized here
-
-         :param function_word_weight: custom weight for function words, usually a small fraction to down-weight them
-
-         typical customizations are filter rules, e.g. dropping certain parts of speech or certain words
-         """
-         ct = Counter()
-
-         # 1 initial segmentation, with optional down-weighting of function words
-         wordflags = list(pseg_cut(text))
-         for word, flag in wordflags:
-             if flag in ('uj', 'd', 'p', 'c', 'u', 'xc', 'x'):
-                 if add_flag:
-                     ct[word + ',' + flag] += function_word_weight
-                 else:
-                     ct[word] += function_word_weight
-             else:
-                 if add_flag:
-                     ct[word + ',' + flag] += 1
-                 else:
-                     ct[word] += 1
-
-         # 2 normalize certain tokens
-         if normalize:
-             ct2 = Counter()
-             for k, v in ct.items():
-                 # token normalization rules can be added here as needed
-                 k = re.sub(r'\d', '0', k)  # replace every digit with 0
-                 ct2[k] += v
-             ct = ct2
-
-         # 3 filter out ignored words
-         if ingore_words:
-             for k in ingore_words:
-                 if k in ct:
-                     del ct[k]
-
-         return ct
-
-     def compute_tfidf(self, outfile=None, sheet_name='tf-idf', normalize=False, function_word_weight=0.2,
-                       add_flag=False):
-         """ Recompute the tf-idf table """
-         from math import log10
-
-         # 1 count frequencies and the number of documents containing each word
-         d = dict()
-         for text in tqdm(self.texts, '词频统计'):
-             ct = self.get_text_tf(text, normalize=normalize, function_word_weight=function_word_weight,
-                                   add_flag=add_flag)
-             for k, v in ct.items():
-                 if k not in d:
-                     d[k] = [0, 0]
-                 d[k] = [d[k][0] + v, d[k][1] + 1]
-
-         # 2 compute tf-idf
-         n = len(self.texts)
-         sum_tf = sum([v[0] for v in d.values()])
-         ls = []
-         for k, v in d.items():
-             idf = log10(n / v[1])
-             # idf = 1
-             ls.append([k, v[0], v[0] / sum_tf, v[1], idf, v[0] * idf])
-
-         df = pd.DataFrame.from_records(ls, columns=('词汇', '频数', '频率', '出现该词文章数', 'idf', 'tf-idf'))
-         df.sort_values(by='tf-idf', ascending=False, inplace=True)
-
-         # 3 save to file
-         if outfile:
-             update_dataframes_to_excel(outfile, {sheet_name: df})
-
-         self.tfidf = {row['词汇']: row['tf-idf'] for idx, row in df.iterrows()}
-         self.default_tfidf = df.loc[len(df) - 1]['tf-idf']  # the last row's weight is the default for unseen words
-
-         return df
-
-     def normalization(self, d):
-         """ Vector normalization
-
-         takes a vector represented as a dict-like structure and normalizes it to unit length
-         """
-         length = sum([v * v for v in d.values()]) ** 0.5  # vector length
-         return {k: v / length for k, v in d.items()}
-
-     def get_text_vec(self, text):
-         """ Get the vectorized representation of a text
-
-         :param str text: text content
-         """
-         ct = self.get_text_tf(text)
-         vec = {k: v * self.tfidf.get(k, self.default_tfidf) for k, v in ct.items()}
-         vec = self.normalization(vec)
-         return vec
-
-     def compute_vecs(self):
-         """ Recompute the vectorized representations """
-         vecs = []
-         for text in tqdm(self.texts, desc='query向量化'):
-             vecs.append(self.get_text_vec(text))
-         self.vecs = vecs
-         return vecs
-
-     def cosine_similar(self, x, y):
-         """ Cosine similarity of two vectors; the larger the value, the more similar
-
-         simplified here to just the dot product of the two vectors, so make sure both inputs have unit length
-         note that x and y are sparse representations, passed in as dicts
-         """
-         keys = x.keys() & y.keys()  # keys shared by x and y
-         return sum([x[k] * y[k] for k in keys])
-
-     def find_similar_vec(self, x, maxn=10):
-         """ Find the vectors most similar to x, returning indices and similarities
-
-         :param x: the object to look up
-         :param maxn: return the maxn most similar objects
-         """
-         if isinstance(x, str):
-             x = self.get_text_vec(x)
-
-         # todo use parallel computation? could also vectorize, but the sparse matrices would take a lot of space
-         sims = [(i, self.cosine_similar(x, v)) for i, v in enumerate(self.vecs)]
-         sims.sort(key=lambda x: x[1], reverse=True)
-         return sims[:maxn]
-
-     def refine_vecs(self):
-         """ Refine the vector data, dropping dimensions with weight below 0.0001 """
-         # 1 sort each vector's weights and drop near-zero dimensions
-         vecs = []
-         for vec in tqdm(self.vecs, '优化向量'):
-             vec = [(k, v) for k, v in vec.items()]
-             vec.sort(key=lambda x: x[1], reverse=True)
-             vec2 = {}
-             for k, v in vec:
-                 if v < 0.0001:
-                     break
-                 vec2[k] = round(v, 4)
-             vecs.append(vec2)
-
-         self.vecs = vecs
-         return self.vecs
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # @Author : 陈坤泽
+ # @Email : 877362867@qq.com
+ # @Date : 2023/11/05
+
+ """ Text-processing utilities built on the jieba library """
+
+ from collections import Counter
+ import re
+
+ from tqdm import tqdm
+ import pandas as pd
+
+ import jieba
+ import jieba.posseg as pseg
+ from simhash import Simhash
+
+ from pyxllib.prog.pupil import DictTool, run_once
+ from pyxllib.file.specialist import XlPath
+ from pyxllib.algo.stat import update_dataframes_to_excel
+
+
+ def jieba_add_words(words):
+     for w in words:
+         jieba.add_word(w)
+
+
+ def jieba_del_words(words):
+     for w in words:
+         jieba.del_word(w)
+
+
+ @run_once('str')
+ def jieba_cut(text):
+     return tuple(jieba.cut(text))
+
+
+ @run_once('str')
+ def pseg_cut(text):
+     return tuple(pseg.cut(text))
+
+
+ def _count_word_frequency(texts, function_word=True):
+     """ Count keyword frequencies (mainly a helper for computing tf-idf)
+
+     :param texts: list of input strings
+     :param function_word: whether to count function words
+     :return: a dict
+         key: the segmented word
+         values: [x, y], where x is the total frequency and y is the number of documents the word appears in
+
+     >>> _count_word_frequency(['正正正正', '正反正', '反反反反'])
+     {'正正': [1, 1], '反反': [2, 1]}
+
+     Result without part-of-speech filtering: {'正正': [2, 1], '正': [1, 1], '反正': [1, 1], '反反': [2, 1]}
+     """
+
+     d = dict()
+     for text in tqdm(texts, '词频统计'):
+         wordflags = list(pseg.cut(text))
+         words = set()
+         for word, flag in wordflags:
+             # function words are not recorded
+             if (not function_word) and flag in ('uj', 'd', 'p', 'c', 'u', 'xc'):
+                 continue
+             words.add(word)
+             if word not in d:
+                 d[word] = [0, 0]
+             d[word][0] += 1
+         for word in words:
+             d[word][1] += 1
+     return d
+
+
+ def analyse_tf_idf(texts, outfile=None, sheet_name='tf-idf', *, function_word=True):
+     """ Analyse tf-idf values
+
+     :param list[str] texts: text contents of multiple documents
+     :return: a DataFrame
+
+     jieba may already ship helpers for this, but it is easy enough to implement by hand
+     note that the tf-idf values returned here are scaled up by the total frequency, so the displayed numbers are larger and easier to read
+     """
+     from math import log10
+
+     frequency = _count_word_frequency(texts, function_word)
+     DictTool.isub(frequency, [' ', '\t', '\n'])
+
+     n = len(texts)
+     sum_frequency = sum([v[0] for v in frequency.values()])
+
+     li = []
+     for k, v in frequency.items():
+         idf = log10(n / v[1])
+         # idf = 1
+         li.append([k, v[0], v[0] / sum_frequency, v[1], idf, v[0] * idf])
+     df = pd.DataFrame.from_records(li, columns=('词汇', '频数', '频率', '出现该词文章数', 'idf', 'tf-idf'))
+     df.sort_values(by='tf-idf', ascending=False, inplace=True)
+
+     if outfile:
+         update_dataframes_to_excel(outfile, {sheet_name: df})
+
+     return df
+
+
+ class TextClassifier:
+     def __init__(self, texts=None):
+         """ Text classifier
+
+         :param list[str] texts: text contents
+         """
+
+         self.texts = []
+         self.tfidf = {}
+         self.vecs = []  # vectorized representation of each text
+         self.default_tfidf = 1  # if tf-idf has not been computed, every weight defaults to 1
+
+         if texts:
+             for text in texts:
+                 self.texts.append(text)
+
+     def get_text_tf(self, text, *,
+                     function_word_weight=0.2,
+                     normalize=True,
+                     ingore_words=(' ', '\t', '\n'),
+                     add_flag=False):
+         """ The algorithm for extracting keywords from text can be customized here
+
+         :param function_word_weight: custom weight for function words, usually a small fraction to down-weight them
+
+         typical customizations are filter rules, e.g. dropping certain parts of speech or certain words
+         """
+         ct = Counter()
+
+         # 1 initial segmentation, with optional down-weighting of function words
+         wordflags = list(pseg_cut(text))
+         for word, flag in wordflags:
+             if flag in ('uj', 'd', 'p', 'c', 'u', 'xc', 'x'):
+                 if add_flag:
+                     ct[word + ',' + flag] += function_word_weight
+                 else:
+                     ct[word] += function_word_weight
+             else:
+                 if add_flag:
+                     ct[word + ',' + flag] += 1
+                 else:
+                     ct[word] += 1
+
+         # 2 normalize certain tokens
+         if normalize:
+             ct2 = Counter()
+             for k, v in ct.items():
+                 # token normalization rules can be added here as needed
+                 k = re.sub(r'\d', '0', k)  # replace every digit with 0
+                 ct2[k] += v
+             ct = ct2
+
+         # 3 filter out ignored words
+         if ingore_words:
+             for k in ingore_words:
+                 if k in ct:
+                     del ct[k]
+
+         return ct
+
+     def compute_tfidf(self, outfile=None, sheet_name='tf-idf', normalize=False, function_word_weight=0.2,
+                       add_flag=False):
+         """ Recompute the tf-idf table """
+         from math import log10
+
+         # 1 count frequencies and the number of documents containing each word
+         d = dict()
+         for text in tqdm(self.texts, '词频统计'):
+             ct = self.get_text_tf(text, normalize=normalize, function_word_weight=function_word_weight,
+                                   add_flag=add_flag)
+             for k, v in ct.items():
+                 if k not in d:
+                     d[k] = [0, 0]
+                 d[k] = [d[k][0] + v, d[k][1] + 1]
+
+         # 2 compute tf-idf
+         n = len(self.texts)
+         sum_tf = sum([v[0] for v in d.values()])
+         ls = []
+         for k, v in d.items():
+             idf = log10(n / v[1])
+             # idf = 1
+             ls.append([k, v[0], v[0] / sum_tf, v[1], idf, v[0] * idf])
+
+         df = pd.DataFrame.from_records(ls, columns=('词汇', '频数', '频率', '出现该词文章数', 'idf', 'tf-idf'))
+         df.sort_values(by='tf-idf', ascending=False, inplace=True)
+
+         # 3 save to file
+         if outfile:
+             update_dataframes_to_excel(outfile, {sheet_name: df})
+
+         self.tfidf = {row['词汇']: row['tf-idf'] for idx, row in df.iterrows()}
+         self.default_tfidf = df.loc[len(df) - 1]['tf-idf']  # the last row's weight is the default for unseen words
+
+         return df
+
+     def normalization(self, d):
+         """ Vector normalization
+
+         takes a vector represented as a dict-like structure and normalizes it to unit length
+         """
+         length = sum([v * v for v in d.values()]) ** 0.5  # vector length
+         return {k: v / length for k, v in d.items()}
+
+     def get_text_vec(self, text):
+         """ Get the vectorized representation of a text
+
+         :param str text: text content
+         """
+         ct = self.get_text_tf(text)
+         vec = {k: v * self.tfidf.get(k, self.default_tfidf) for k, v in ct.items()}
+         vec = self.normalization(vec)
+         return vec
+
+     def compute_vecs(self):
+         """ Recompute the vectorized representations """
+         vecs = []
+         for text in tqdm(self.texts, desc='query向量化'):
+             vecs.append(self.get_text_vec(text))
+         self.vecs = vecs
+         return vecs
+
+     def cosine_similar(self, x, y):
+         """ Cosine similarity of two vectors; the larger the value, the more similar
+
+         simplified here to just the dot product of the two vectors, so make sure both inputs have unit length
+         note that x and y are sparse representations, passed in as dicts
+         """
+         keys = x.keys() & y.keys()  # keys shared by x and y
+         return sum([x[k] * y[k] for k in keys])
+
+     def find_similar_vec(self, x, maxn=10):
+         """ Find the vectors most similar to x, returning indices and similarities
+
+         :param x: the object to look up
+         :param maxn: return the maxn most similar objects
+         """
+         if isinstance(x, str):
+             x = self.get_text_vec(x)
+
+         # todo use parallel computation? could also vectorize, but the sparse matrices would take a lot of space
+         sims = [(i, self.cosine_similar(x, v)) for i, v in enumerate(self.vecs)]
+         sims.sort(key=lambda x: x[1], reverse=True)
+         return sims[:maxn]
+
+     def refine_vecs(self):
+         """ Refine the vector data, dropping dimensions with weight below 0.0001 """
+         # 1 sort each vector's weights and drop near-zero dimensions
+         vecs = []
+         for vec in tqdm(self.vecs, '优化向量'):
+             vec = [(k, v) for k, v in vec.items()]
+             vec.sort(key=lambda x: x[1], reverse=True)
+             vec2 = {}
+             for k, v in vec:
+                 if v < 0.0001:
+                     break
+                 vec2[k] = round(v, 4)
+             vecs.append(vec2)
+
+         self.vecs = vecs
+         return self.vecs
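
For orientation, the tf-idf/vector flow above is typically driven as in the following minimal sketch. This is not from the package docs: the sample texts and the added word are invented, and it assumes jieba, pandas, tqdm, and simhash are installed alongside pyxllib.

```python
# Minimal usage sketch for pyxllib.text.jiebalib (sample data invented).
from pyxllib.text.jiebalib import TextClassifier, jieba_add_words

# Optionally teach jieba domain-specific vocabulary before segmenting.
jieba_add_words(['文本分类'])

texts = ['今天天气很好', '明天天气也不错', '股市今天大幅下跌']
tc = TextClassifier(texts)

tc.compute_tfidf()   # builds the tf-idf weight table from the corpus
tc.compute_vecs()    # caches one unit-length sparse vector (a dict) per text

# Query with a raw string; returns [(index, cosine similarity), ...], best first.
print(tc.find_similar_vec('明天天气如何', maxn=2))
```

Because `compute_vecs` stores unit-length vectors, `cosine_similar` can stay a plain dot product over the shared keys of the two dicts.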
pyxllib/text/jinjalib.py CHANGED
@@ -1,32 +1,27 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Author : 陈坤泽
- # @Email : 877362867@qq.com
- # @Date : 2024/05/26
-
- from pyxllib.prog.pupil import check_install_package
-
- # ensure the jinja2 template library is installed
- check_install_package('jinja2')
-
- import jinja2
- from jinja2 import Template, Environment
-
- from pyxllib.file.specialist import XlPath
-
-
- def set_template(s, *args, **kwargs):
-     """ todo: is this name too likely to clash? """
-     return Template(s.strip(), *args, **kwargs)
-
-
- def set_meta_template(s, meta_start='[[', meta_end=']]', **kwargs):
-     """ Supports pre-rendering with custom delimiters first, then returns a standard render template """
-     t = Template(s.strip(), variable_start_string=meta_start,
-                  variable_end_string=meta_end).render(**kwargs)
-     return Template(t)
-
-
- def get_jinja_template(name, **kwargs):
-     template = Environment(**kwargs).from_string((XlPath(__file__).parent / f'templates/{name}').read_text())
-     return template
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # @Author : 陈坤泽
+ # @Email : 877362867@qq.com
+ # @Date : 2024/05/26
+
+ import jinja2
+ from jinja2 import Template, Environment
+
+ from pyxllib.file.specialist import XlPath
+
+
+ def set_template(s, *args, **kwargs):
+     """ todo: is this name too likely to clash? """
+     return Template(s.strip(), *args, **kwargs)
+
+
+ def set_meta_template(s, meta_start='[[', meta_end=']]', **kwargs):
+     """ Supports pre-rendering with custom delimiters first, then returns a standard render template """
+     t = Template(s.strip(), variable_start_string=meta_start,
+                  variable_end_string=meta_end).render(**kwargs)
+     return Template(t)
+
+
+ def get_jinja_template(name, **kwargs):
+     template = Environment(**kwargs).from_string((XlPath(__file__).parent / f'templates/{name}').read_text())
+     return template
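
The only functional change in jinjalib.py is that the new version no longer calls check_install_package('jinja2') at import time, so jinja2 must already be installed when the module is imported. A minimal sketch of the two template helpers, with invented template strings:

```python
# Usage sketch for pyxllib.text.jinjalib (template strings invented).
from pyxllib.text.jinjalib import set_template, set_meta_template

# set_template: strip the source string and build an ordinary jinja2 Template.
t = set_template('Hello {{ name }}!')
print(t.render(name='world'))       # Hello world!

# set_meta_template: first render the [[ ... ]] placeholders, then return a
# standard {{ ... }} template for a second rendering pass.
t2 = set_meta_template('[[ greeting ]], {{ name }}!', greeting='Hi')
print(t2.render(name='pyxllib'))    # Hi, pyxllib!
```

The two-stage delimiters let a meta-level tool fill in `[[ ... ]]` slots without touching the `{{ ... }}` variables that the final caller is expected to render.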