pyxllib 0.3.197__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +21 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +541 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +149 -149
- pyxllib/algo/unitlib.py +66 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +852 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +240 -240
- pyxllib/data/jsonlib.py +89 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1127 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +246 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/old.py +663 -663
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +497 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +827 -827
- pyxllib/ext/utools.py +351 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +88 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1105 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +761 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +148 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +426 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +685 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2829 -2829
- pyxllib/file/xlsxlib.py +3131 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +64 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1197 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +391 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +108 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +39 -39
- pyxllib/text/airscript.js +744 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +32 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +747 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/METADATA +1 -1
- pyxllib-0.3.200.dist-info/RECORD +126 -0
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/licenses/LICENSE +190 -190
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +0 -0
pyxllib/text/levenshtein.py
CHANGED
@@ -1,303 +1,303 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Date : 2021/06/06 17:01
|
6
|
-
|
7
|
-
from pyxllib.prog.pupil import check_install_package
|
8
|
-
|
9
|
-
# 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
|
10
|
-
# 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
|
11
|
-
# MatchSimString计算编辑距离需要
|
12
|
-
check_install_package('Levenshtein', 'python-Levenshtein')
|
13
|
-
|
14
|
-
from collections import defaultdict
|
15
|
-
from more_itertools import chunked
|
16
|
-
import warnings
|
17
|
-
|
18
|
-
import Levenshtein
|
19
|
-
import numpy as np
|
20
|
-
import pandas as pd
|
21
|
-
|
22
|
-
from pyxllib.prog.pupil import run_once
|
23
|
-
from pyxllib.prog.specialist import dataframe_str
|
24
|
-
from pyxllib.text.pupil import briefstr
|
25
|
-
|
26
|
-
# Silence one specific warning: the FutureWarning reported for line 1005 of
# the module sklearn.cluster._agglomerative.
# NOTE(review): presumably this is the deprecation notice for the `affinity`
# argument used by the clustering code in this module, and the line number is
# tied to one particular scikit-learn release — confirm.
warnings.filterwarnings("ignore", category=FutureWarning,
                        module="sklearn.cluster._agglomerative",
                        lineno=1005)
|
30
|
-
|
31
|
-
|
32
|
-
@run_once('str')
def get_levenshtein_similar(x, y):
    """ Return the Levenshtein similarity ratio between ``x`` and ``y``.

    The value comes straight from ``Levenshtein.ratio``, i.e. it is a
    similarity (larger means closer), not a raw edit distance.

    NOTE(review): per the original docstring the ``run_once('str')`` decorator
    is what caches results between calls — confirm against
    ``pyxllib.prog.pupil.run_once``.
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio
|
36
|
-
|
37
|
-
|
38
|
-
class MatchSimString:
    r""" Fuzzy-match a query string against a list of candidate strings.

    Every candidate is kept in three parallel lists that share one index:
    the string as it was added (``origin_str``), its preprocessed form
    (``key_str``, the text that is actually compared) and an optional payload
    (``ext_value``). Similarity is ``Levenshtein.ratio``: larger means closer.

    NOTE: only candidates go through the preprocessing callable; the query
    given to ``match`` / ``match_many`` / ``match_test`` is compared as passed.

    Example (sample data and printed values kept from the original docs)::

        mss = MatchSimString()

        # 1 register the candidates
        mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
        mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
        mss.append_candidate('删除所有标签中间多余的空白')

        # 2 first query
        s = '奕本初一福周厦门培油'
        idx, sim = mss.match(s)
        print('target:', mss[idx])  # 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
        print('similarity:', sim)  # 0.22

        # 3 second query
        s = '圆柱与【圆锥】_教案空白版'
        idx, sim = mss.match(s)
        print('target:', mss[idx])  # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
        print('similarity:', sim)  # 0.375

    When ``append_candidate`` was given a second argument, that payload is
    available as ``mss.ext_value[idx]``.
    """

    # The docstring above is a raw string because the sample path contains a
    # backslash that would otherwise be an invalid escape sequence.

    def __init__(self, method=briefstr):
        """
        :param method: callable applied to every candidate before it is stored
            as a comparison key; a non-callable value (e.g. None) disables
            the preprocessing
        """
        self.preproc = method
        self.origin_str = []  # candidates exactly as they were added
        self.key_str = []  # preprocessed candidates, the strings actually compared
        self.ext_value = []  # optional payload attached to each candidate

    def __getitem__(self, item):
        """ Return the original (unprocessed) candidate stored at ``item``. """
        return self.origin_str[item]

    def __delitem__(self, item):
        """ Remove the candidate at ``item`` from all three parallel lists. """
        del self.origin_str[item]
        del self.key_str[item]
        del self.ext_value[item]

    def __len__(self):
        """ Return the number of stored candidates. """
        return len(self.key_str)

    def get_similarity(self, x, y):
        """ Placeholder for a pairwise similarity hook.

        Not implemented: it returns None, and no other method of this class
        calls it (they all use ``Levenshtein.ratio`` directly).
        """
        pass

    def append_candidate(self, k, v=None):
        """ Register a candidate string.

        :param k: the candidate; stored as given and, when the preprocessing
            attribute is callable, also in preprocessed form as the match key
        :param v: optional payload, retrievable later through ``ext_value``
        """
        self.origin_str.append(k)
        if callable(self.preproc):
            k = self.preproc(k)
        self.key_str.append(k)
        self.ext_value.append(v)

    def match(self, s):
        """ Return the best matching candidate for ``s``.

        :param s: query string, compared as given against every key string
        :return: ``(idx, sim)`` of the first candidate with the highest ratio;
            ``(-1, 0)`` when there are no candidates or none has a ratio
            above 0. Beware that indexing with -1 yields the *last* candidate.
        """
        idx, sim = -1, 0
        for i, key in enumerate(self.key_str):
            sim_ = Levenshtein.ratio(key, s)
            if sim_ > sim:
                idx, sim = i, sim_
        return idx, sim

    def match_many(self, s, count=1):
        """ Return the ``count`` best matching candidates for ``s``.

        :param str s: query string
        :param int count: number of results wanted
        :return: list of ``(idx, sim)`` pairs, most similar first
        """
        scores = [(i, Levenshtein.ratio(key, s)) for i, key in enumerate(self.key_str)]
        # sorted() is stable, so equally similar candidates keep insertion order
        return sorted(scores, key=lambda x: x[1], reverse=True)[:count]

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """ Print a table ranking the candidates by similarity to ``s``.

        :param s: query string
        :param count: how many rows to print
            -1: every candidate
            0 < count < 1: that fraction of the candidates (e.g. 0.4 -> top 40%),
                at least one row
            positive int: the top ``count`` candidates
        :param showstr: maps a key string to the text shown in the last column
        """
        # 1 score every candidate; rows are numbered from 0
        n = len(self)
        res = [[i, v, Levenshtein.ratio(k, s), showstr(k)]
               for i, (k, v) in enumerate(zip(self.key_str, self.ext_value))]

        # 2 rank by similarity (descending) and keep the requested share
        res = sorted(res, key=lambda x: -x[2])
        if 0 < count < 1:
            n = max(1, int(n * count))
        elif isinstance(count, int) and count > 0:
            n = min(count, n)
        res = res[:n]

        # 3 print; the column headers mean: index, label (payload), similarity
        #   (the header literally reads "edit distance"), content
        df = pd.DataFrame.from_records(res, columns=('序号', '标签', '编辑距离', '内容'))
        text = dataframe_str(df)
        # per the original note, TeXstudio cannot display the bullet character
        text = text.replace('\u2022', '')
        print(text)

    def agglomerative_clustering(self, threshold=0.5):
        """ Cluster the stored key strings hierarchically (complete linkage).

        :param threshold: distance threshold, where distance is
            ``1 - Levenshtein.ratio``; per the original docs strings closer
            than this end up in one cluster, so smaller values give more,
            finer clusters
        :return: array with one cluster label per candidate, in index order
        """
        n = len(self)
        if n < 2:
            # The estimator rejects fewer than two samples; zero or one
            # candidate trivially forms (at most) a single cluster.
            return np.zeros(n, dtype=int)

        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 symmetric pairwise distance matrix (the diagonal stays 0)
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # 2 cluster; scikit-learn renamed `affinity` to `metric` (old name
        #   removed in 1.4), so try the new keyword first and fall back
        params = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            clustering = AgglomerativeClustering(metric='precomputed', **params)
        except TypeError:
            clustering = AgglomerativeClustering(affinity='precomputed', **params)

        return clustering.fit_predict(distance_matrix)

    def display_clusters(self, threshold=0.5):
        """ Group the original strings by their cluster label.

        :param threshold: forwarded to ``agglomerative_clustering``
        :return: dict mapping cluster label to the original strings in that
            cluster, ordered from the largest cluster to the smallest
        """
        labels = self.agglomerative_clustering(threshold=threshold)

        cluster_dict = defaultdict(list)
        for origin, label in zip(self.origin_str, labels):
            cluster_dict[label].append(origin)

        return dict(sorted(cluster_dict.items(), key=lambda x: -len(x[1])))
|
191
|
-
|
192
|
-
|
193
|
-
class HierarchicalMatchSimString(MatchSimString):
    """ MatchSimString variant that groups candidates by hierarchical clustering.

    Recommended by the original author when the candidate set is very large.
    """

    def __init__(self, method=briefstr):
        """
        :param method: preprocessing callable, forwarded to MatchSimString
        """
        super().__init__(method)
        self.groups = dict()  # representative index -> member indices, set by init_groups

    def get_center_sample(self, indices=None):
        """ Return the index whose summed similarity to the others is highest.

        :param indices: candidate indices to consider; all candidates when omitted
        :return: the most central index (the first one on ties)
        :raises ValueError: when there is nothing to choose from
        """
        if indices is None:
            indices = range(len(self))
        else:
            # iterated once per element below, so a one-shot iterable must be
            # materialised first
            indices = list(indices)

        # pairwise cache, filled symmetrically so each pair is computed once
        cached_results = {}

        def get_similarity(i, j):
            """ Similarity of the key strings at ``i`` and ``j``, memoised. """
            if (i, j) in cached_results:
                return cached_results[(i, j)]
            sim_val = Levenshtein.ratio(self.key_str[i], self.key_str[j])
            cached_results[(i, j)] = cached_results[(j, i)] = sim_val
            return sim_val

        return max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))

    def merge_group(self, indices, threshold=0.5, strategy='center'):
        """ Cluster the given candidate indices and pick one representative each.

        :param indices: candidate indices to cluster
        :param threshold: distance threshold (distance = ``1 - Levenshtein.ratio``)
            handed to complete-linkage agglomerative clustering
        :param strategy: how the representative of a cluster is chosen
            center, the member with the smallest mean distance to its cluster
            first, the first member in input order
        :return: dict mapping representative index to the member indices of its
            cluster, ordered from the largest cluster to the smallest
        :raises ValueError: for an unknown ``strategy``
        """
        if strategy not in ('center', 'first'):
            raise ValueError(f'Invalid strategy: {strategy}')

        indices = list(indices)
        n = len(indices)
        # The estimator rejects fewer than two samples; init_groups can hand
        # over a trailing batch holding a single index.
        if n == 0:
            return {}
        if n == 1:
            return {indices[0]: [indices[0]]}

        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 symmetric pairwise distance matrix over local positions 0..n-1
        keys = [self.key_str[idx] for idx in indices]
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(keys[i], keys[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # scikit-learn renamed `affinity` to `metric` (old name removed in
        # 1.4), so try the new keyword first and fall back
        params = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            clustering = AgglomerativeClustering(metric='precomputed', **params)
        except TypeError:
            clustering = AgglomerativeClustering(affinity='precomputed', **params)
        labels = clustering.fit_predict(distance_matrix)

        # 2 collect the local positions of every cluster
        positions_by_label = defaultdict(list)
        for pos, label in enumerate(labels):
            positions_by_label[label].append(pos)

        # 3 map a representative to its members, largest cluster first
        result = {}
        for positions in sorted(positions_by_label.values(), key=lambda x: -len(x)):
            items = [indices[pos] for pos in positions]
            if strategy == 'first':
                representative = items[0]
            else:
                # mean distance of each member to its own cluster
                sub_matrix = distance_matrix[np.ix_(positions, positions)]
                representative = items[int(np.argmin(sub_matrix.mean(axis=1)))]
            result[representative] = items

        return result

    def init_groups(self, threshold=0.5, batch_size=1000):
        """ Build ``self.groups`` by repeatedly merging similar candidates.

        :param threshold: distance threshold; candidates within this distance
            are merged into one group
        :param batch_size: groups are merged batch by batch because comparing
            everything pairwise may be too expensive; per the original docs
            this is less precise but much cheaper
        :return: dict mapping representative index to member indices, ordered
            from the largest group to the smallest (also stored in ``self.groups``)
        """
        # 1 every candidate starts as its own group
        groups = {i: [i] for i in range(len(self))}

        # 2 merge until a full pass no longer reduces the number of groups
        while len(groups) > 1:
            new_groups = {}
            for indices in chunked(groups, batch_size):
                # only the representatives are compared; the members of the
                # merged groups are concatenated under the new representative
                merged = self.merge_group(indices, threshold=threshold)
                for representative, members in merged.items():
                    new_groups[representative] = [item for member in members
                                                  for item in groups[member]]

            if len(new_groups) == len(groups):
                break
            groups = new_groups

        # 3 largest group first
        self.groups = dict(sorted(groups.items(), key=lambda x: -len(x[1])))
        return self.groups
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2021/06/06 17:01
|
6
|
+
|
7
|
+
from pyxllib.prog.pupil import check_install_package
|
8
|
+
|
9
|
+
# 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
|
10
|
+
# 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
|
11
|
+
# MatchSimString计算编辑距离需要
|
12
|
+
check_install_package('Levenshtein', 'python-Levenshtein')
|
13
|
+
|
14
|
+
from collections import defaultdict
|
15
|
+
from more_itertools import chunked
|
16
|
+
import warnings
|
17
|
+
|
18
|
+
import Levenshtein
|
19
|
+
import numpy as np
|
20
|
+
import pandas as pd
|
21
|
+
|
22
|
+
from pyxllib.prog.pupil import run_once
|
23
|
+
from pyxllib.prog.specialist import dataframe_str
|
24
|
+
from pyxllib.text.pupil import briefstr
|
25
|
+
|
26
|
+
# Silence one specific warning: the FutureWarning reported for line 1005 of
# the module sklearn.cluster._agglomerative.
# NOTE(review): presumably this is the deprecation notice for the `affinity`
# argument used by the clustering code in this module, and the line number is
# tied to one particular scikit-learn release — confirm.
warnings.filterwarnings("ignore", category=FutureWarning,
                        module="sklearn.cluster._agglomerative",
                        lineno=1005)
|
30
|
+
|
31
|
+
|
32
|
+
@run_once('str')
def get_levenshtein_similar(x, y):
    """ Return the Levenshtein similarity ratio between ``x`` and ``y``.

    The value comes straight from ``Levenshtein.ratio``, i.e. it is a
    similarity (larger means closer), not a raw edit distance.

    NOTE(review): per the original docstring the ``run_once('str')`` decorator
    is what caches results between calls — confirm against
    ``pyxllib.prog.pupil.run_once``.
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio
|
36
|
+
|
37
|
+
|
38
|
+
class MatchSimString:
    r""" Fuzzy-match a query string against a list of candidate strings.

    Every candidate is kept in three parallel lists that share one index:
    the string as it was added (``origin_str``), its preprocessed form
    (``key_str``, the text that is actually compared) and an optional payload
    (``ext_value``). Similarity is ``Levenshtein.ratio``: larger means closer.

    NOTE: only candidates go through the preprocessing callable; the query
    given to ``match`` / ``match_many`` / ``match_test`` is compared as passed.

    Example (sample data and printed values kept from the original docs)::

        mss = MatchSimString()

        # 1 register the candidates
        mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
        mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
        mss.append_candidate('删除所有标签中间多余的空白')

        # 2 first query
        s = '奕本初一福周厦门培油'
        idx, sim = mss.match(s)
        print('target:', mss[idx])  # 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
        print('similarity:', sim)  # 0.22

        # 3 second query
        s = '圆柱与【圆锥】_教案空白版'
        idx, sim = mss.match(s)
        print('target:', mss[idx])  # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
        print('similarity:', sim)  # 0.375

    When ``append_candidate`` was given a second argument, that payload is
    available as ``mss.ext_value[idx]``.
    """

    # The docstring above is a raw string because the sample path contains a
    # backslash that would otherwise be an invalid escape sequence.

    def __init__(self, method=briefstr):
        """
        :param method: callable applied to every candidate before it is stored
            as a comparison key; a non-callable value (e.g. None) disables
            the preprocessing
        """
        self.preproc = method
        self.origin_str = []  # candidates exactly as they were added
        self.key_str = []  # preprocessed candidates, the strings actually compared
        self.ext_value = []  # optional payload attached to each candidate

    def __getitem__(self, item):
        """ Return the original (unprocessed) candidate stored at ``item``. """
        return self.origin_str[item]

    def __delitem__(self, item):
        """ Remove the candidate at ``item`` from all three parallel lists. """
        del self.origin_str[item]
        del self.key_str[item]
        del self.ext_value[item]

    def __len__(self):
        """ Return the number of stored candidates. """
        return len(self.key_str)

    def get_similarity(self, x, y):
        """ Placeholder for a pairwise similarity hook.

        Not implemented: it returns None, and no other method of this class
        calls it (they all use ``Levenshtein.ratio`` directly).
        """
        pass

    def append_candidate(self, k, v=None):
        """ Register a candidate string.

        :param k: the candidate; stored as given and, when the preprocessing
            attribute is callable, also in preprocessed form as the match key
        :param v: optional payload, retrievable later through ``ext_value``
        """
        self.origin_str.append(k)
        if callable(self.preproc):
            k = self.preproc(k)
        self.key_str.append(k)
        self.ext_value.append(v)

    def match(self, s):
        """ Return the best matching candidate for ``s``.

        :param s: query string, compared as given against every key string
        :return: ``(idx, sim)`` of the first candidate with the highest ratio;
            ``(-1, 0)`` when there are no candidates or none has a ratio
            above 0. Beware that indexing with -1 yields the *last* candidate.
        """
        idx, sim = -1, 0
        for i, key in enumerate(self.key_str):
            sim_ = Levenshtein.ratio(key, s)
            if sim_ > sim:
                idx, sim = i, sim_
        return idx, sim

    def match_many(self, s, count=1):
        """ Return the ``count`` best matching candidates for ``s``.

        :param str s: query string
        :param int count: number of results wanted
        :return: list of ``(idx, sim)`` pairs, most similar first
        """
        scores = [(i, Levenshtein.ratio(key, s)) for i, key in enumerate(self.key_str)]
        # sorted() is stable, so equally similar candidates keep insertion order
        return sorted(scores, key=lambda x: x[1], reverse=True)[:count]

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """ Print a table ranking the candidates by similarity to ``s``.

        :param s: query string
        :param count: how many rows to print
            -1: every candidate
            0 < count < 1: that fraction of the candidates (e.g. 0.4 -> top 40%),
                at least one row
            positive int: the top ``count`` candidates
        :param showstr: maps a key string to the text shown in the last column
        """
        # 1 score every candidate; rows are numbered from 0
        n = len(self)
        res = [[i, v, Levenshtein.ratio(k, s), showstr(k)]
               for i, (k, v) in enumerate(zip(self.key_str, self.ext_value))]

        # 2 rank by similarity (descending) and keep the requested share
        res = sorted(res, key=lambda x: -x[2])
        if 0 < count < 1:
            n = max(1, int(n * count))
        elif isinstance(count, int) and count > 0:
            n = min(count, n)
        res = res[:n]

        # 3 print; the column headers mean: index, label (payload), similarity
        #   (the header literally reads "edit distance"), content
        df = pd.DataFrame.from_records(res, columns=('序号', '标签', '编辑距离', '内容'))
        text = dataframe_str(df)
        # per the original note, TeXstudio cannot display the bullet character
        text = text.replace('\u2022', '')
        print(text)

    def agglomerative_clustering(self, threshold=0.5):
        """ Cluster the stored key strings hierarchically (complete linkage).

        :param threshold: distance threshold, where distance is
            ``1 - Levenshtein.ratio``; per the original docs strings closer
            than this end up in one cluster, so smaller values give more,
            finer clusters
        :return: array with one cluster label per candidate, in index order
        """
        n = len(self)
        if n < 2:
            # The estimator rejects fewer than two samples; zero or one
            # candidate trivially forms (at most) a single cluster.
            return np.zeros(n, dtype=int)

        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 symmetric pairwise distance matrix (the diagonal stays 0)
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # 2 cluster; scikit-learn renamed `affinity` to `metric` (old name
        #   removed in 1.4), so try the new keyword first and fall back
        params = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            clustering = AgglomerativeClustering(metric='precomputed', **params)
        except TypeError:
            clustering = AgglomerativeClustering(affinity='precomputed', **params)

        return clustering.fit_predict(distance_matrix)

    def display_clusters(self, threshold=0.5):
        """ Group the original strings by their cluster label.

        :param threshold: forwarded to ``agglomerative_clustering``
        :return: dict mapping cluster label to the original strings in that
            cluster, ordered from the largest cluster to the smallest
        """
        labels = self.agglomerative_clustering(threshold=threshold)

        cluster_dict = defaultdict(list)
        for origin, label in zip(self.origin_str, labels):
            cluster_dict[label].append(origin)

        return dict(sorted(cluster_dict.items(), key=lambda x: -len(x[1])))
|
191
|
+
|
192
|
+
|
193
|
+
class HierarchicalMatchSimString(MatchSimString):
    """ MatchSimString variant that groups candidates by hierarchical clustering.

    Recommended by the original author when the candidate set is very large.
    """

    def __init__(self, method=briefstr):
        """
        :param method: preprocessing callable, forwarded to MatchSimString
        """
        super().__init__(method)
        self.groups = dict()  # representative index -> member indices, set by init_groups

    def get_center_sample(self, indices=None):
        """ Return the index whose summed similarity to the others is highest.

        :param indices: candidate indices to consider; all candidates when omitted
        :return: the most central index (the first one on ties)
        :raises ValueError: when there is nothing to choose from
        """
        if indices is None:
            indices = range(len(self))
        else:
            # iterated once per element below, so a one-shot iterable must be
            # materialised first
            indices = list(indices)

        # pairwise cache, filled symmetrically so each pair is computed once
        cached_results = {}

        def get_similarity(i, j):
            """ Similarity of the key strings at ``i`` and ``j``, memoised. """
            if (i, j) in cached_results:
                return cached_results[(i, j)]
            sim_val = Levenshtein.ratio(self.key_str[i], self.key_str[j])
            cached_results[(i, j)] = cached_results[(j, i)] = sim_val
            return sim_val

        return max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))

    def merge_group(self, indices, threshold=0.5, strategy='center'):
        """ Cluster the given candidate indices and pick one representative each.

        :param indices: candidate indices to cluster
        :param threshold: distance threshold (distance = ``1 - Levenshtein.ratio``)
            handed to complete-linkage agglomerative clustering
        :param strategy: how the representative of a cluster is chosen
            center, the member with the smallest mean distance to its cluster
            first, the first member in input order
        :return: dict mapping representative index to the member indices of its
            cluster, ordered from the largest cluster to the smallest
        :raises ValueError: for an unknown ``strategy``
        """
        if strategy not in ('center', 'first'):
            raise ValueError(f'Invalid strategy: {strategy}')

        indices = list(indices)
        n = len(indices)
        # The estimator rejects fewer than two samples; init_groups can hand
        # over a trailing batch holding a single index.
        if n == 0:
            return {}
        if n == 1:
            return {indices[0]: [indices[0]]}

        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 symmetric pairwise distance matrix over local positions 0..n-1
        keys = [self.key_str[idx] for idx in indices]
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(keys[i], keys[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # scikit-learn renamed `affinity` to `metric` (old name removed in
        # 1.4), so try the new keyword first and fall back
        params = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            clustering = AgglomerativeClustering(metric='precomputed', **params)
        except TypeError:
            clustering = AgglomerativeClustering(affinity='precomputed', **params)
        labels = clustering.fit_predict(distance_matrix)

        # 2 collect the local positions of every cluster
        positions_by_label = defaultdict(list)
        for pos, label in enumerate(labels):
            positions_by_label[label].append(pos)

        # 3 map a representative to its members, largest cluster first
        result = {}
        for positions in sorted(positions_by_label.values(), key=lambda x: -len(x)):
            items = [indices[pos] for pos in positions]
            if strategy == 'first':
                representative = items[0]
            else:
                # mean distance of each member to its own cluster
                sub_matrix = distance_matrix[np.ix_(positions, positions)]
                representative = items[int(np.argmin(sub_matrix.mean(axis=1)))]
            result[representative] = items

        return result

    def init_groups(self, threshold=0.5, batch_size=1000):
        """ Build ``self.groups`` by repeatedly merging similar candidates.

        :param threshold: distance threshold; candidates within this distance
            are merged into one group
        :param batch_size: groups are merged batch by batch because comparing
            everything pairwise may be too expensive; per the original docs
            this is less precise but much cheaper
        :return: dict mapping representative index to member indices, ordered
            from the largest group to the smallest (also stored in ``self.groups``)
        """
        # 1 every candidate starts as its own group
        groups = {i: [i] for i in range(len(self))}

        # 2 merge until a full pass no longer reduces the number of groups
        while len(groups) > 1:
            new_groups = {}
            for indices in chunked(groups, batch_size):
                # only the representatives are compared; the members of the
                # merged groups are concatenated under the new representative
                merged = self.merge_group(indices, threshold=threshold)
                for representative, members in merged.items():
                    new_groups[representative] = [item for member in members
                                                  for item in groups[member]]

            if len(new_groups) == len(groups):
                break
            groups = new_groups

        # 3 largest group first
        self.groups = dict(sorted(groups.items(), key=lambda x: -len(x[1])))
        return self.groups
|