pyxllib 0.3.197__py3-none-any.whl → 3.201.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. pyxllib/__init__.py +14 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +537 -541
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -389
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -629
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -241
  11. pyxllib/algo/stat.py +494 -494
  12. pyxllib/algo/treelib.py +145 -149
  13. pyxllib/algo/unitlib.py +62 -66
  14. pyxllib/autogui/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -246
  16. pyxllib/autogui/all.py +9 -9
  17. pyxllib/autogui/autogui.py +846 -852
  18. pyxllib/autogui/uiautolib.py +362 -362
  19. pyxllib/autogui/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -827
  21. pyxllib/autogui/wechat_msg.py +421 -421
  22. pyxllib/autogui/wxautolib.py +84 -84
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -137
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +236 -240
  34. pyxllib/data/jsonlib.py +85 -89
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1111 -1127
  37. pyxllib/data/sqlite.py +568 -568
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -505
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +251 -246
  42. pyxllib/ext/drissionlib.py +277 -277
  43. pyxllib/ext/kq5034lib.py +12 -12
  44. pyxllib/ext/qt.py +449 -449
  45. pyxllib/ext/robustprocfile.py +493 -497
  46. pyxllib/ext/seleniumlib.py +76 -76
  47. pyxllib/ext/tk.py +173 -173
  48. pyxllib/ext/unixlib.py +821 -827
  49. pyxllib/ext/utools.py +345 -351
  50. pyxllib/ext/webhook.py +124 -119
  51. pyxllib/ext/win32lib.py +40 -40
  52. pyxllib/ext/wjxlib.py +91 -88
  53. pyxllib/ext/wpsapi.py +124 -124
  54. pyxllib/ext/xlwork.py +9 -9
  55. pyxllib/ext/yuquelib.py +1110 -1105
  56. pyxllib/file/__init__.py +17 -17
  57. pyxllib/file/docxlib.py +757 -761
  58. pyxllib/file/gitlib.py +309 -309
  59. pyxllib/file/libreoffice.py +165 -165
  60. pyxllib/file/movielib.py +144 -148
  61. pyxllib/file/newbie.py +10 -10
  62. pyxllib/file/onenotelib.py +1469 -1469
  63. pyxllib/file/packlib/__init__.py +330 -330
  64. pyxllib/file/packlib/zipfile.py +2441 -2441
  65. pyxllib/file/pdflib.py +422 -426
  66. pyxllib/file/pupil.py +185 -185
  67. pyxllib/file/specialist/__init__.py +681 -685
  68. pyxllib/file/specialist/dirlib.py +799 -799
  69. pyxllib/file/specialist/download.py +193 -193
  70. pyxllib/file/specialist/filelib.py +2825 -2829
  71. pyxllib/file/xlsxlib.py +3122 -3131
  72. pyxllib/file/xlsyncfile.py +341 -341
  73. pyxllib/prog/__init__.py +5 -5
  74. pyxllib/prog/cachetools.py +58 -64
  75. pyxllib/prog/deprecatedlib.py +233 -233
  76. pyxllib/prog/filelock.py +42 -42
  77. pyxllib/prog/ipyexec.py +253 -253
  78. pyxllib/prog/multiprogs.py +940 -940
  79. pyxllib/prog/newbie.py +451 -451
  80. pyxllib/prog/pupil.py +1208 -1197
  81. pyxllib/prog/sitepackages.py +33 -33
  82. pyxllib/prog/specialist/__init__.py +348 -391
  83. pyxllib/prog/specialist/bc.py +203 -203
  84. pyxllib/prog/specialist/browser.py +497 -497
  85. pyxllib/prog/specialist/common.py +347 -347
  86. pyxllib/prog/specialist/datetime.py +198 -198
  87. pyxllib/prog/specialist/tictoc.py +240 -240
  88. pyxllib/prog/specialist/xllog.py +180 -180
  89. pyxllib/prog/xlosenv.py +110 -108
  90. pyxllib/stdlib/__init__.py +17 -17
  91. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  92. pyxllib/stdlib/tablepyxl/style.py +303 -303
  93. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  94. pyxllib/text/__init__.py +8 -8
  95. pyxllib/text/ahocorasick.py +36 -39
  96. pyxllib/text/airscript.js +754 -744
  97. pyxllib/text/charclasslib.py +121 -121
  98. pyxllib/text/jiebalib.py +267 -267
  99. pyxllib/text/jinjalib.py +27 -32
  100. pyxllib/text/jsa_ai_prompt.md +271 -271
  101. pyxllib/text/jscode.py +922 -922
  102. pyxllib/text/latex/__init__.py +158 -158
  103. pyxllib/text/levenshtein.py +303 -303
  104. pyxllib/text/nestenv.py +1215 -1215
  105. pyxllib/text/newbie.py +300 -300
  106. pyxllib/text/pupil/__init__.py +8 -8
  107. pyxllib/text/pupil/common.py +1121 -1121
  108. pyxllib/text/pupil/xlalign.py +326 -326
  109. pyxllib/text/pycode.py +47 -47
  110. pyxllib/text/specialist/__init__.py +8 -8
  111. pyxllib/text/specialist/common.py +112 -112
  112. pyxllib/text/specialist/ptag.py +186 -186
  113. pyxllib/text/spellchecker.py +172 -172
  114. pyxllib/text/templates/echart_base.html +10 -10
  115. pyxllib/text/templates/highlight_code.html +16 -16
  116. pyxllib/text/templates/latex_editor.html +102 -102
  117. pyxllib/text/vbacode.py +17 -17
  118. pyxllib/text/xmllib.py +741 -747
  119. pyxllib/xl.py +42 -39
  120. pyxllib/xlcv.py +17 -17
  121. pyxllib-3.201.1.dist-info/METADATA +296 -0
  122. pyxllib-3.201.1.dist-info/RECORD +125 -0
  123. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/licenses/LICENSE +190 -190
  124. pyxllib/ext/old.py +0 -663
  125. pyxllib-0.3.197.dist-info/METADATA +0 -48
  126. pyxllib-0.3.197.dist-info/RECORD +0 -126
  127. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/WHEEL +0 -0
@@ -1,303 +1,303 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2021/06/06 17:01
6
-
7
- from pyxllib.prog.pupil import check_install_package
8
-
9
- # 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
10
- # 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
11
- # MatchSimString计算编辑距离需要
12
- check_install_package('Levenshtein', 'python-Levenshtein')
13
-
14
- from collections import defaultdict
15
- from more_itertools import chunked
16
- import warnings
17
-
18
- import Levenshtein
19
- import numpy as np
20
- import pandas as pd
21
-
22
- from pyxllib.prog.pupil import run_once
23
- from pyxllib.prog.specialist import dataframe_str
24
- from pyxllib.text.pupil import briefstr
25
-
26
- # 忽略特定的警告
27
- warnings.filterwarnings("ignore", category=FutureWarning,
28
- module="sklearn.cluster._agglomerative",
29
- lineno=1005)
30
-
31
-
32
- @run_once('str')
33
- def get_levenshtein_similar(x, y):
34
- """ 缓存各字符串之间的编辑距离 """
35
- return Levenshtein.ratio(x, y)
36
-
37
-
38
- class MatchSimString:
39
- """ 匹配近似字符串
40
-
41
- mss = MatchSimString()
42
-
43
- # 1 添加候选对象
44
- mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
45
- mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
46
- mss.append_candidate('删除所有标签中间多余的空白')
47
-
48
- # 2 需要匹配的对象1
49
- s = '奕本初一福周厦门培油'
50
-
51
- idx, sim = mss.match(s)
52
- print('匹配目标:', mss[idx]) # 匹配目标: 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
53
- print('相似度:', sim) # 相似度: 0.22
54
-
55
- # 3 需要匹配的对象2
56
- s = '圆柱与【圆锥】_教案空白版'
57
-
58
- idx, sim = mss.match(s)
59
- print('匹配目标:', mss[idx]) # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
60
- print('相似度:', sim) # 相似度: 0.375
61
-
62
- 如果append_candidate有传递2个扩展信息参数,可以索引获取:
63
- mss.ext_value[idx]
64
- """
65
-
66
- def __init__(self, method=briefstr):
67
- self.preproc = method
68
- self.origin_str = [] # 原始字符串内容
69
- self.key_str = [] # 对原始字符串进行处理后的字符
70
- self.ext_value = [] # 扩展存储一些信息
71
-
72
- def __getitem__(self, item):
73
- return self.origin_str[item]
74
-
75
- def __delitem__(self, item):
76
- del self.origin_str[item]
77
- del self.key_str[item]
78
- del self.ext_value[item]
79
-
80
- def __len__(self):
81
- return len(self.key_str)
82
-
83
- def get_similarity(self, x, y):
84
- """ 计算两对数据之间的相似度 """
85
- pass
86
-
87
- def append_candidate(self, k, v=None):
88
- self.origin_str.append(k)
89
- if callable(self.preproc):
90
- k = self.preproc(k)
91
- self.key_str.append(k)
92
- self.ext_value.append(v)
93
-
94
- def match(self, s):
95
- """ 跟候选字符串进行匹配,返回最佳匹配结果
96
- """
97
- idx, sim = -1, 0
98
- for i in range(len(self)):
99
- k, v = self.key_str[i], self.ext_value[i]
100
- sim_ = Levenshtein.ratio(k, s)
101
- if sim_ > sim:
102
- sim = sim_
103
- idx = i
104
- i += 1
105
- return idx, sim
106
-
107
- def match_many(self, s, count=1):
108
- """跟候选字符串进行匹配,返回多个最佳匹配结果
109
- :param str s: 待匹配的字符串
110
- :param int count: 需要返回的匹配数量
111
- :return: 匹配结果列表,列表中的元素为(idx, sim)对
112
- """
113
- scores = [(i, Levenshtein.ratio(self.key_str[i], s)) for i in range(len(self))]
114
- # 根据相似度排序并返回前count个结果
115
- return sorted(scores, key=lambda x: x[1], reverse=True)[:count]
116
-
117
- def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
118
- """输入一个字符串s,和候选项做近似匹配
119
-
120
- :param s: 需要进行匹配的字符串s
121
- :param count: 只输出部分匹配结果
122
- -1:输出所有匹配结果
123
- 0 < count < 1:例如0.4,则只输出匹配度最高的40%结果
124
- 整数:输出匹配度最高的count个结果
125
- :param showstr: 字符串显示效果
126
- """
127
- # 1 计算编辑距离,存储结果到res
128
- res = []
129
- n = len(self)
130
- for i in range(n):
131
- k, v = self.key_str[i], self.ext_value[i]
132
- sim = Levenshtein.ratio(k, s)
133
- res.append([i, v, sim, showstr(k)]) # 输出的时候从0开始编号
134
- i += 1
135
-
136
- # 2 排序、节选结果
137
- res = sorted(res, key=lambda x: -x[2])
138
- if 0 < count < 1:
139
- n = max(1, int(n * count))
140
- elif isinstance(count, int) and count > 0:
141
- n = min(count, n)
142
- res = res[:n]
143
-
144
- # 3 输出
145
- df = pd.DataFrame.from_records(res, columns=('序号', '标签', '编辑距离', '内容'))
146
- s = dataframe_str(df)
147
- s = s.replace('\u2022', '') # texstudio无法显示会报错的字符
148
- print(s)
149
-
150
- def agglomerative_clustering(self, threshold=0.5):
151
- """ 对内部字符串进行层次聚类
152
-
153
- :param threshold: 可以理解成距离的阈值,距离小于这个阈值的字符串会被聚为一类
154
- 值越小,分出的类别越多越细
155
- """
156
- check_install_package('sklearn', 'scikit-learn')
157
- from sklearn.cluster import AgglomerativeClustering
158
-
159
- # 1 给每个样本标类别
160
- distance_matrix = np.zeros((len(self), len(self)))
161
- for i in range(len(self)):
162
- for j in range(i + 1, len(self)):
163
- # 我们需要距离,所以用1减去相似度
164
- distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
165
- distance_matrix[i, j] = distance_matrix[j, i] = distance
166
-
167
- # 进行层次聚类
168
- clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
169
- distance_threshold=threshold,
170
- linkage='complete')
171
- labels = clustering.fit_predict(distance_matrix)
172
-
173
- return labels
174
-
175
- def display_clusters(self, threshold=0.5):
176
- """ 根据agglomerative_clustering的结果,显示各个聚类的内容 """
177
-
178
- labels = self.agglomerative_clustering(threshold=threshold)
179
- cluster_dict = defaultdict(list)
180
-
181
- # 组织数据到字典中
182
- for idx, label in enumerate(labels):
183
- cluster_dict[label].append(self.origin_str[idx])
184
-
185
- # 按标签排序并显示
186
- result = {}
187
- for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
188
- result[label] = items
189
-
190
- return result
191
-
192
-
193
- class HierarchicalMatchSimString(MatchSimString):
194
- """ 在面对数据量很大的候选数据情况下,建议使用这个层次聚类后的匹配方法 """
195
-
196
- def __init__(self, method=briefstr):
197
- super().__init__(method)
198
- self.groups = dict()
199
-
200
- def get_center_sample(self, indices=None):
201
- """ 输入一组下标,计算中心样本,未输入参数值的时候,则在全量样本里找 """
202
- if indices is None:
203
- indices = range(len(self))
204
-
205
- # 用于存储之前计算的结果
206
- cached_results = {}
207
-
208
- def get_similarity(i, j):
209
- """ 获取两个索引的相似度,利用缓存来避免重复计算 """
210
- if (i, j) in cached_results:
211
- return cached_results[(i, j)]
212
- sim_val = Levenshtein.ratio(self.key_str[i], self.key_str[j])
213
- cached_results[(i, j)] = cached_results[(j, i)] = sim_val
214
- return sim_val
215
-
216
- center_idx = max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))
217
- return center_idx
218
-
219
- def merge_group(self, indices, threshold=0.5, strategy='center'):
220
- """ 对输入的indexs清单,按照threshold的阈值进行合并
221
- 返回的是一个字典,key是代表性样本,value是同组内的数据编号
222
-
223
- :param strategy: 代表样本的挑选策略
224
- center,中心样本
225
- first,第一个样本
226
- """
227
- check_install_package('sklearn', 'scikit-learn')
228
- from sklearn.cluster import AgglomerativeClustering
229
-
230
- # 1 给每个样本标类别
231
- n = len(indices)
232
- distance_matrix = np.zeros((n, n))
233
- for i in range(n):
234
- for j in range(i + 1, n):
235
- # 我们需要距离,所以用1减去相似度
236
- distance = 1 - Levenshtein.ratio(self.key_str[indices[i]], self.key_str[indices[j]])
237
- distance_matrix[i, j] = distance_matrix[j, i] = distance
238
-
239
- # 进行层次聚类
240
- clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
241
- distance_threshold=threshold,
242
- linkage='complete')
243
- labels = clustering.fit_predict(distance_matrix)
244
-
245
- # 2 分组字典
246
- cluster_dict = defaultdict(list)
247
- # 组织数据到字典中
248
- for i, label in enumerate(labels):
249
- cluster_dict[label].append(indices[i])
250
-
251
- # 3 改成代表样本映射到一组里,并且按照样本数从多到少排序
252
- result = {}
253
- for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
254
- if strategy == 'first':
255
- representative = items[0]
256
- elif strategy == 'center':
257
- # 使用局部索引计算平均距离
258
- local_indices = [i for i, idx in enumerate(indices) if idx in items]
259
- sub_matrix = distance_matrix[np.ix_(local_indices, local_indices)]
260
- avg_distances = sub_matrix.mean(axis=1)
261
- representative_idx = np.argmin(avg_distances)
262
- representative = items[representative_idx]
263
- else:
264
- raise ValueError(f'Invalid strategy: {strategy}')
265
- result[representative] = items
266
-
267
- return result
268
-
269
- def init_groups(self, threshold=0.5, batch_size=1000):
270
- """
271
- :param threshold: 按照阈值进行分组,在这个距离内的都会归到一组
272
- :param batch_size: 因为数据可能太大,不可能一次性全量两两比较,这里可以分batch处理
273
- 这样虽然结果不太精确,但能大大减小运算量
274
- """
275
- # 1 最开始每个样本都是一个组
276
- groups = {i: [i] for i in range(len(self))}
277
- new_groups = {}
278
-
279
- # 2 不断合并,直到没有组数变化
280
- while len(groups) > 1:
281
- for indices in chunked(groups.keys(), batch_size):
282
- # 对于这里返回的字典,原groups里的values也要对应拼接的
283
- indices2 = self.merge_group(indices, threshold=threshold)
284
- for idx, idxs in indices2.items():
285
- # 获取原始分组中的索引
286
- original_idxs = [groups[original_idx] for original_idx in idxs]
287
- # 展平列表并分配到新分组中
288
- new_groups[idx] = [item for sublist in original_idxs for item in sublist]
289
-
290
- # 如果分组没有发生变化,退出循环
291
- if len(new_groups) == len(groups):
292
- break
293
-
294
- groups = new_groups
295
- new_groups = {}
296
-
297
- # 3 按数量从多到少排序
298
- new_groups = {}
299
- for label, items in sorted(groups.items(), key=lambda x: -len(x[1])):
300
- new_groups[label] = items # 暂用第一个出现的作为代表
301
-
302
- self.groups = new_groups
303
- return self.groups
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : 陈坤泽
4
+ # @Email : 877362867@qq.com
5
+ # @Date : 2021/06/06 17:01
6
+
7
+ # from pyxllib.prog.pupil import check_install_package
8
+
9
+ # 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
10
+ # 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
11
+ # MatchSimString计算编辑距离需要
12
+ # check_install_package('Levenshtein', 'python-Levenshtein')
13
+
14
+ from collections import defaultdict
15
+ from more_itertools import chunked
16
+ import warnings
17
+
18
+ import Levenshtein
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ from pyxllib.prog.pupil import run_once
23
+ from pyxllib.prog.specialist import dataframe_str
24
+ from pyxllib.text.pupil import briefstr
25
+
26
+ # 忽略特定的警告
27
+ warnings.filterwarnings("ignore", category=FutureWarning,
28
+ module="sklearn.cluster._agglomerative",
29
+ lineno=1005)
30
+
31
+
32
@run_once('str')
def get_levenshtein_similar(x, y):
    """Return the Levenshtein similarity ratio between ``x`` and ``y``.

    The function is wrapped in ``run_once('str')``; the original note says this
    caches the value computed for each pair of strings (presumably keyed on the
    string form of the arguments -- confirm against ``run_once``).

    :param x: first string
    :param y: second string
    :return: whatever ``Levenshtein.ratio(x, y)`` returns for the pair
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio
36
+
37
+
38
class MatchSimString:
    r""" Fuzzy matching of a query string against a list of candidate strings.

    Usage:

        mss = MatchSimString()

        # 1 register the candidates
        mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
        mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
        mss.append_candidate('删除所有标签中间多余的空白')

        # 2 first query
        s = '奕本初一福周厦门培油'

        idx, sim = mss.match(s)
        print('best match:', mss[idx])  # 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
        print('similarity:', sim)  # 0.22

        # 3 second query
        s = '圆柱与【圆锥】_教案空白版'

        idx, sim = mss.match(s)
        print('best match:', mss[idx])  # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
        print('similarity:', sim)  # 0.375

    If a second (extension) argument was passed to append_candidate, it can be
    read back by index:
        mss.ext_value[idx]

    The docstring is a raw string because the sample data contains a backslash.
    """

    def __init__(self, method=briefstr):
        """
        :param method: preprocessing callable applied to every candidate before it is
            stored as a matching key; a non-callable value disables preprocessing
        """
        self.preproc = method
        self.origin_str = []  # candidates exactly as they were supplied
        self.key_str = []  # candidates after preprocessing, used for the comparisons
        self.ext_value = []  # optional extra value attached to each candidate

    def __getitem__(self, item):
        """ Return the original (unprocessed) candidate stored at ``item``. """
        return self.origin_str[item]

    def __delitem__(self, item):
        """ Remove the candidate at ``item`` from all three parallel lists. """
        del self.origin_str[item]
        del self.key_str[item]
        del self.ext_value[item]

    def __len__(self):
        """ Number of registered candidates. """
        return len(self.key_str)

    def get_similarity(self, x, y):
        """ Placeholder for a similarity measure between two items.

        Not implemented here: it returns None and is not called by the other
        methods of this class.
        """
        pass

    def append_candidate(self, k, v=None):
        """ Register a candidate string.

        :param k: candidate string; stored as-is in ``origin_str`` and, after
            preprocessing (when ``self.preproc`` is callable), in ``key_str``
        :param v: optional extra value stored alongside in ``ext_value``
        """
        self.origin_str.append(k)
        if callable(self.preproc):
            k = self.preproc(k)
        self.key_str.append(k)
        self.ext_value.append(v)

    def match(self, s):
        """ Compare ``s`` with every candidate key and return the best hit.

        Note that ``s`` itself is not passed through ``self.preproc``.

        :param s: string to look up
        :return: ``(idx, sim)``; ``(-1, 0)`` when there are no candidates or no
            candidate has a similarity above 0. On ties the earliest candidate wins.
        """
        idx, sim = -1, 0
        for i, key in enumerate(self.key_str):
            score = Levenshtein.ratio(key, s)
            if score > sim:
                idx, sim = i, score
        return idx, sim

    def match_many(self, s, count=1):
        """ Compare ``s`` with every candidate key and return the best ``count`` hits.

        :param str s: string to look up
        :param int count: number of results wanted
        :return: list of ``(idx, sim)`` pairs ordered by descending similarity
        """
        scores = [(i, Levenshtein.ratio(key, s)) for i, key in enumerate(self.key_str)]
        # sorted() is stable, so equally similar candidates keep their insertion order
        return sorted(scores, key=lambda pair: pair[1], reverse=True)[:count]

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """ Print a table showing how well ``s`` matches each candidate.

        :param s: string to look up
        :param count: how many rows to print
            -1 (or any other value not covered below): all rows
            0 < count < 1: a fraction, e.g. 0.4 prints the best 40% (at least one row)
            positive int: the best ``count`` rows
        :param showstr: callable used to shorten each key for display
        """
        # 1 score every candidate; the first column is the 0-based candidate index
        rows = [[i, ext, Levenshtein.ratio(key, s), showstr(key)]
                for i, (key, ext) in enumerate(zip(self.key_str, self.ext_value))]

        # 2 best first, then cut down to the requested amount
        rows.sort(key=lambda row: -row[2])
        limit = len(rows)
        if 0 < count < 1:
            limit = max(1, int(limit * count))
        elif isinstance(count, int) and count > 0:
            limit = min(count, limit)
        rows = rows[:limit]

        # 3 print; the third column is labelled as edit distance but holds the
        #   similarity ratio computed above
        df = pd.DataFrame.from_records(rows, columns=('序号', '标签', '编辑距离', '内容'))
        text = dataframe_str(df)
        text = text.replace('\u2022', '')  # bullet characters that TeXstudio fails to display
        print(text)

    @staticmethod
    def _fit_cluster_labels(distance_matrix, threshold):
        """ Run complete-linkage agglomerative clustering on a precomputed distance matrix.

        scikit-learn is imported lazily so that it is only required when clustering
        is actually used.

        :raises ImportError: if scikit-learn is not installed
        """
        try:
            from sklearn.cluster import AgglomerativeClustering
        except ImportError as err:
            raise ImportError('scikit-learn is required for clustering, '
                              'install it with: pip install scikit-learn') from err

        params = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            # scikit-learn >= 1.2 names the parameter `metric`
            model = AgglomerativeClustering(metric='precomputed', **params)
        except TypeError:
            # older releases only know the former name `affinity`
            model = AgglomerativeClustering(affinity='precomputed', **params)
        return model.fit_predict(distance_matrix)

    def agglomerative_clustering(self, threshold=0.5):
        """ Cluster the candidate keys hierarchically.

        :param threshold: distance threshold passed to the clustering; the smaller
            the value, the more (and finer) clusters are produced
        :return: array with one cluster label per candidate
        :raises ImportError: if scikit-learn is not installed
        """
        n = len(self)
        if n < 2:
            # the clustering needs at least two samples; zero or one candidate
            # trivially forms at most a single cluster
            return np.zeros(n, dtype=int)

        # pairwise distances, defined as 1 - similarity
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        return self._fit_cluster_labels(distance_matrix, threshold)

    def display_clusters(self, threshold=0.5):
        """ Group the original candidate strings by their cluster label.

        Despite the name nothing is printed; the grouping is returned.

        :param threshold: forwarded to agglomerative_clustering
        :return: dict mapping cluster label to the list of original strings in that
            cluster, ordered from the largest cluster to the smallest
        """
        labels = self.agglomerative_clustering(threshold=threshold)

        clusters = defaultdict(list)
        for text, label in zip(self.origin_str, labels):
            clusters[label].append(text)

        return dict(sorted(clusters.items(), key=lambda kv: -len(kv[1])))
191
+
192
+
193
class HierarchicalMatchSimString(MatchSimString):
    """ Variant of MatchSimString that groups the candidates by hierarchical
    clustering first; recommended when the number of candidates is very large.
    """

    def __init__(self, method=briefstr):
        """
        :param method: preprocessing callable, forwarded to MatchSimString
        """
        super().__init__(method)
        self.groups = dict()  # representative index -> member indices, filled by init_groups

    def get_center_sample(self, indices=None):
        """ Return the index whose summed similarity to the given samples is highest.

        :param indices: candidate indices to consider; all candidates when omitted
        :return: the central index (the first one on ties)
        :raises ValueError: if there are no indices to choose from
        """
        if indices is None:
            indices = range(len(self))
        # materialise once: the collection is traversed again for every element
        indices = list(indices)

        # similarity is symmetric, so every pair is computed only once
        cached_results = {}

        def get_similarity(i, j):
            """ Similarity of the keys at positions i and j, memoised in both orders. """
            if (i, j) in cached_results:
                return cached_results[(i, j)]
            sim_val = Levenshtein.ratio(self.key_str[i], self.key_str[j])
            cached_results[(i, j)] = cached_results[(j, i)] = sim_val
            return sim_val

        return max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))

    @staticmethod
    def _complete_linkage_labels(distance_matrix, threshold):
        """ Run complete-linkage agglomerative clustering on a precomputed distance matrix.

        scikit-learn is imported lazily so that it is only required when grouping
        is actually used.

        :raises ImportError: if scikit-learn is not installed
        """
        try:
            from sklearn.cluster import AgglomerativeClustering
        except ImportError as err:
            raise ImportError('scikit-learn is required for clustering, '
                              'install it with: pip install scikit-learn') from err

        params = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            # scikit-learn >= 1.2 names the parameter `metric`
            model = AgglomerativeClustering(metric='precomputed', **params)
        except TypeError:
            # older releases only know the former name `affinity`
            model = AgglomerativeClustering(affinity='precomputed', **params)
        return model.fit_predict(distance_matrix)

    def merge_group(self, indices, threshold=0.5, strategy='center'):
        """ Cluster the given candidate indices and pick a representative per cluster.

        :param indices: candidate indices to cluster
        :param threshold: distance threshold passed to the clustering
        :param strategy: how the representative of a cluster is chosen
            center, the member with the smallest mean distance to its cluster
            first, the first member of the cluster
        :return: dict mapping representative index to the list of member indices,
            ordered from the largest cluster to the smallest
        :raises ValueError: if ``strategy`` is not one of the values above
        :raises ImportError: if scikit-learn is not installed
        """
        if strategy not in ('center', 'first'):
            raise ValueError(f'Invalid strategy: {strategy}')

        indices = list(indices)
        n = len(indices)
        if n < 2:
            # the clustering needs at least two samples; a lone index (e.g. the last,
            # short batch produced by init_groups) simply represents itself
            return {idx: [idx] for idx in indices}

        # 1 pairwise distances, defined as 1 - similarity
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(self.key_str[indices[i]], self.key_str[indices[j]])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        labels = self._complete_linkage_labels(distance_matrix, threshold)

        # 2 collect, per cluster, the positions (within `indices`) of its members
        positions_by_label = defaultdict(list)
        for pos, label in enumerate(labels):
            positions_by_label[label].append(pos)

        # 3 map a representative to each cluster, largest cluster first
        result = {}
        for positions in sorted(positions_by_label.values(), key=lambda p: -len(p)):
            items = [indices[pos] for pos in positions]
            if strategy == 'first':
                representative = items[0]
            else:
                # mean distance of every member to the members of its own cluster
                sub_matrix = distance_matrix[np.ix_(positions, positions)]
                representative = items[int(np.argmin(sub_matrix.mean(axis=1)))]
            result[representative] = items

        return result

    def init_groups(self, threshold=0.5, batch_size=1000):
        """ Partition all candidates into groups and store the result in ``self.groups``.

        :param threshold: distance threshold; samples within it end up in one group
        :param batch_size: the data may be too large to compare every pair at once,
            so each round clusters the current representatives batch by batch; the
            result is less exact but needs far fewer comparisons
        :return: dict mapping representative index to member indices, ordered from
            the largest group to the smallest
        """
        # 1 every sample starts as its own group
        groups = {i: [i] for i in range(len(self))}

        # 2 keep merging the representatives until the number of groups stops changing
        while len(groups) > 1:
            merged = {}
            for batch in chunked(groups.keys(), batch_size):
                for rep, members in self.merge_group(batch, threshold=threshold).items():
                    # a merged group holds everything its member groups already contained
                    merged[rep] = [item for member in members for item in groups[member]]

            if len(merged) == len(groups):
                break
            groups = merged

        # 3 largest group first
        self.groups = dict(sorted(groups.items(), key=lambda kv: -len(kv[1])))
        return self.groups