pyxllib 0.3.197__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -541
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -389
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -629
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -241
  11. pyxllib/algo/stat.py +494 -494
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. pyxllib/autogui/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -246
  16. pyxllib/autogui/all.py +9 -9
  17. pyxllib/autogui/autogui.py +852 -852
  18. pyxllib/autogui/uiautolib.py +362 -362
  19. pyxllib/autogui/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -827
  21. pyxllib/autogui/wechat_msg.py +421 -421
  22. pyxllib/autogui/wxautolib.py +84 -84
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -137
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -240
  34. pyxllib/data/jsonlib.py +89 -89
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -1127
  37. pyxllib/data/sqlite.py +568 -568
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -505
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -277
  43. pyxllib/ext/kq5034lib.py +12 -12
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -497
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -827
  50. pyxllib/ext/utools.py +351 -351
  51. pyxllib/ext/webhook.py +124 -119
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -88
  54. pyxllib/ext/wpsapi.py +124 -124
  55. pyxllib/ext/xlwork.py +9 -9
  56. pyxllib/ext/yuquelib.py +1105 -1105
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -165
  61. pyxllib/file/movielib.py +148 -148
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -330
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -193
  71. pyxllib/file/specialist/filelib.py +2829 -2829
  72. pyxllib/file/xlsxlib.py +3131 -3131
  73. pyxllib/file/xlsyncfile.py +341 -341
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -64
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -42
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -940
  80. pyxllib/prog/newbie.py +451 -451
  81. pyxllib/prog/pupil.py +1197 -1197
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -391
  84. pyxllib/prog/specialist/bc.py +203 -203
  85. pyxllib/prog/specialist/browser.py +497 -497
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +198 -198
  88. pyxllib/prog/specialist/tictoc.py +240 -240
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -108
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -744
  98. pyxllib/text/charclasslib.py +121 -121
  99. pyxllib/text/jiebalib.py +267 -267
  100. pyxllib/text/jinjalib.py +32 -32
  101. pyxllib/text/jsa_ai_prompt.md +271 -271
  102. pyxllib/text/jscode.py +922 -922
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -300
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1121
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +10 -10
  116. pyxllib/text/templates/highlight_code.html +16 -16
  117. pyxllib/text/templates/latex_editor.html +102 -102
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -747
  120. pyxllib/xl.py +42 -39
  121. pyxllib/xlcv.py +17 -17
  122. {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/METADATA +1 -1
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/licenses/LICENSE +190 -190
  125. pyxllib-0.3.197.dist-info/RECORD +0 -126
  126. {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +0 -0
@@ -1,303 +1,303 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2021/06/06 17:01
6
-
7
- from pyxllib.prog.pupil import check_install_package
8
-
9
- # 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
10
- # 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
11
- # MatchSimString计算编辑距离需要
12
- check_install_package('Levenshtein', 'python-Levenshtein')
13
-
14
- from collections import defaultdict
15
- from more_itertools import chunked
16
- import warnings
17
-
18
- import Levenshtein
19
- import numpy as np
20
- import pandas as pd
21
-
22
- from pyxllib.prog.pupil import run_once
23
- from pyxllib.prog.specialist import dataframe_str
24
- from pyxllib.text.pupil import briefstr
25
-
26
- # 忽略特定的警告
27
- warnings.filterwarnings("ignore", category=FutureWarning,
28
- module="sklearn.cluster._agglomerative",
29
- lineno=1005)
30
-
31
-
32
- @run_once('str')
33
- def get_levenshtein_similar(x, y):
34
- """ 缓存各字符串之间的编辑距离 """
35
- return Levenshtein.ratio(x, y)
36
-
37
-
38
- class MatchSimString:
39
- """ 匹配近似字符串
40
-
41
- mss = MatchSimString()
42
-
43
- # 1 添加候选对象
44
- mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
45
- mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
46
- mss.append_candidate('删除所有标签中间多余的空白')
47
-
48
- # 2 需要匹配的对象1
49
- s = '奕本初一福周厦门培油'
50
-
51
- idx, sim = mss.match(s)
52
- print('匹配目标:', mss[idx]) # 匹配目标: 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
53
- print('相似度:', sim) # 相似度: 0.22
54
-
55
- # 3 需要匹配的对象2
56
- s = '圆柱与【圆锥】_教案空白版'
57
-
58
- idx, sim = mss.match(s)
59
- print('匹配目标:', mss[idx]) # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
60
- print('相似度:', sim) # 相似度: 0.375
61
-
62
- 如果append_candidate有传递2个扩展信息参数,可以索引获取:
63
- mss.ext_value[idx]
64
- """
65
-
66
- def __init__(self, method=briefstr):
67
- self.preproc = method
68
- self.origin_str = [] # 原始字符串内容
69
- self.key_str = [] # 对原始字符串进行处理后的字符
70
- self.ext_value = [] # 扩展存储一些信息
71
-
72
- def __getitem__(self, item):
73
- return self.origin_str[item]
74
-
75
- def __delitem__(self, item):
76
- del self.origin_str[item]
77
- del self.key_str[item]
78
- del self.ext_value[item]
79
-
80
- def __len__(self):
81
- return len(self.key_str)
82
-
83
- def get_similarity(self, x, y):
84
- """ 计算两对数据之间的相似度 """
85
- pass
86
-
87
- def append_candidate(self, k, v=None):
88
- self.origin_str.append(k)
89
- if callable(self.preproc):
90
- k = self.preproc(k)
91
- self.key_str.append(k)
92
- self.ext_value.append(v)
93
-
94
- def match(self, s):
95
- """ 跟候选字符串进行匹配,返回最佳匹配结果
96
- """
97
- idx, sim = -1, 0
98
- for i in range(len(self)):
99
- k, v = self.key_str[i], self.ext_value[i]
100
- sim_ = Levenshtein.ratio(k, s)
101
- if sim_ > sim:
102
- sim = sim_
103
- idx = i
104
- i += 1
105
- return idx, sim
106
-
107
- def match_many(self, s, count=1):
108
- """跟候选字符串进行匹配,返回多个最佳匹配结果
109
- :param str s: 待匹配的字符串
110
- :param int count: 需要返回的匹配数量
111
- :return: 匹配结果列表,列表中的元素为(idx, sim)对
112
- """
113
- scores = [(i, Levenshtein.ratio(self.key_str[i], s)) for i in range(len(self))]
114
- # 根据相似度排序并返回前count个结果
115
- return sorted(scores, key=lambda x: x[1], reverse=True)[:count]
116
-
117
- def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
118
- """输入一个字符串s,和候选项做近似匹配
119
-
120
- :param s: 需要进行匹配的字符串s
121
- :param count: 只输出部分匹配结果
122
- -1:输出所有匹配结果
123
- 0 < count < 1:例如0.4,则只输出匹配度最高的40%结果
124
- 整数:输出匹配度最高的count个结果
125
- :param showstr: 字符串显示效果
126
- """
127
- # 1 计算编辑距离,存储结果到res
128
- res = []
129
- n = len(self)
130
- for i in range(n):
131
- k, v = self.key_str[i], self.ext_value[i]
132
- sim = Levenshtein.ratio(k, s)
133
- res.append([i, v, sim, showstr(k)]) # 输出的时候从0开始编号
134
- i += 1
135
-
136
- # 2 排序、节选结果
137
- res = sorted(res, key=lambda x: -x[2])
138
- if 0 < count < 1:
139
- n = max(1, int(n * count))
140
- elif isinstance(count, int) and count > 0:
141
- n = min(count, n)
142
- res = res[:n]
143
-
144
- # 3 输出
145
- df = pd.DataFrame.from_records(res, columns=('序号', '标签', '编辑距离', '内容'))
146
- s = dataframe_str(df)
147
- s = s.replace('\u2022', '') # texstudio无法显示会报错的字符
148
- print(s)
149
-
150
- def agglomerative_clustering(self, threshold=0.5):
151
- """ 对内部字符串进行层次聚类
152
-
153
- :param threshold: 可以理解成距离的阈值,距离小于这个阈值的字符串会被聚为一类
154
- 值越小,分出的类别越多越细
155
- """
156
- check_install_package('sklearn', 'scikit-learn')
157
- from sklearn.cluster import AgglomerativeClustering
158
-
159
- # 1 给每个样本标类别
160
- distance_matrix = np.zeros((len(self), len(self)))
161
- for i in range(len(self)):
162
- for j in range(i + 1, len(self)):
163
- # 我们需要距离,所以用1减去相似度
164
- distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
165
- distance_matrix[i, j] = distance_matrix[j, i] = distance
166
-
167
- # 进行层次聚类
168
- clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
169
- distance_threshold=threshold,
170
- linkage='complete')
171
- labels = clustering.fit_predict(distance_matrix)
172
-
173
- return labels
174
-
175
- def display_clusters(self, threshold=0.5):
176
- """ 根据agglomerative_clustering的结果,显示各个聚类的内容 """
177
-
178
- labels = self.agglomerative_clustering(threshold=threshold)
179
- cluster_dict = defaultdict(list)
180
-
181
- # 组织数据到字典中
182
- for idx, label in enumerate(labels):
183
- cluster_dict[label].append(self.origin_str[idx])
184
-
185
- # 按标签排序并显示
186
- result = {}
187
- for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
188
- result[label] = items
189
-
190
- return result
191
-
192
-
193
- class HierarchicalMatchSimString(MatchSimString):
194
- """ 在面对数据量很大的候选数据情况下,建议使用这个层次聚类后的匹配方法 """
195
-
196
- def __init__(self, method=briefstr):
197
- super().__init__(method)
198
- self.groups = dict()
199
-
200
- def get_center_sample(self, indices=None):
201
- """ 输入一组下标,计算中心样本,未输入参数值的时候,则在全量样本里找 """
202
- if indices is None:
203
- indices = range(len(self))
204
-
205
- # 用于存储之前计算的结果
206
- cached_results = {}
207
-
208
- def get_similarity(i, j):
209
- """ 获取两个索引的相似度,利用缓存来避免重复计算 """
210
- if (i, j) in cached_results:
211
- return cached_results[(i, j)]
212
- sim_val = Levenshtein.ratio(self.key_str[i], self.key_str[j])
213
- cached_results[(i, j)] = cached_results[(j, i)] = sim_val
214
- return sim_val
215
-
216
- center_idx = max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))
217
- return center_idx
218
-
219
- def merge_group(self, indices, threshold=0.5, strategy='center'):
220
- """ 对输入的indexs清单,按照threshold的阈值进行合并
221
- 返回的是一个字典,key是代表性样本,value是同组内的数据编号
222
-
223
- :param strategy: 代表样本的挑选策略
224
- center,中心样本
225
- first,第一个样本
226
- """
227
- check_install_package('sklearn', 'scikit-learn')
228
- from sklearn.cluster import AgglomerativeClustering
229
-
230
- # 1 给每个样本标类别
231
- n = len(indices)
232
- distance_matrix = np.zeros((n, n))
233
- for i in range(n):
234
- for j in range(i + 1, n):
235
- # 我们需要距离,所以用1减去相似度
236
- distance = 1 - Levenshtein.ratio(self.key_str[indices[i]], self.key_str[indices[j]])
237
- distance_matrix[i, j] = distance_matrix[j, i] = distance
238
-
239
- # 进行层次聚类
240
- clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
241
- distance_threshold=threshold,
242
- linkage='complete')
243
- labels = clustering.fit_predict(distance_matrix)
244
-
245
- # 2 分组字典
246
- cluster_dict = defaultdict(list)
247
- # 组织数据到字典中
248
- for i, label in enumerate(labels):
249
- cluster_dict[label].append(indices[i])
250
-
251
- # 3 改成代表样本映射到一组里,并且按照样本数从多到少排序
252
- result = {}
253
- for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
254
- if strategy == 'first':
255
- representative = items[0]
256
- elif strategy == 'center':
257
- # 使用局部索引计算平均距离
258
- local_indices = [i for i, idx in enumerate(indices) if idx in items]
259
- sub_matrix = distance_matrix[np.ix_(local_indices, local_indices)]
260
- avg_distances = sub_matrix.mean(axis=1)
261
- representative_idx = np.argmin(avg_distances)
262
- representative = items[representative_idx]
263
- else:
264
- raise ValueError(f'Invalid strategy: {strategy}')
265
- result[representative] = items
266
-
267
- return result
268
-
269
- def init_groups(self, threshold=0.5, batch_size=1000):
270
- """
271
- :param threshold: 按照阈值进行分组,在这个距离内的都会归到一组
272
- :param batch_size: 因为数据可能太大,不可能一次性全量两两比较,这里可以分batch处理
273
- 这样虽然结果不太精确,但能大大减小运算量
274
- """
275
- # 1 最开始每个样本都是一个组
276
- groups = {i: [i] for i in range(len(self))}
277
- new_groups = {}
278
-
279
- # 2 不断合并,直到没有组数变化
280
- while len(groups) > 1:
281
- for indices in chunked(groups.keys(), batch_size):
282
- # 对于这里返回的字典,原groups里的values也要对应拼接的
283
- indices2 = self.merge_group(indices, threshold=threshold)
284
- for idx, idxs in indices2.items():
285
- # 获取原始分组中的索引
286
- original_idxs = [groups[original_idx] for original_idx in idxs]
287
- # 展平列表并分配到新分组中
288
- new_groups[idx] = [item for sublist in original_idxs for item in sublist]
289
-
290
- # 如果分组没有发生变化,退出循环
291
- if len(new_groups) == len(groups):
292
- break
293
-
294
- groups = new_groups
295
- new_groups = {}
296
-
297
- # 3 按数量从多到少排序
298
- new_groups = {}
299
- for label, items in sorted(groups.items(), key=lambda x: -len(x[1])):
300
- new_groups[label] = items # 暂用第一个出现的作为代表
301
-
302
- self.groups = new_groups
303
- return self.groups
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : 陈坤泽
4
+ # @Email : 877362867@qq.com
5
+ # @Date : 2021/06/06 17:01
6
+
7
+ from pyxllib.prog.pupil import check_install_package
8
+
9
+ # 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
10
+ # 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
11
+ # MatchSimString计算编辑距离需要
12
+ check_install_package('Levenshtein', 'python-Levenshtein')
13
+
14
+ from collections import defaultdict
15
+ from more_itertools import chunked
16
+ import warnings
17
+
18
+ import Levenshtein
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ from pyxllib.prog.pupil import run_once
23
+ from pyxllib.prog.specialist import dataframe_str
24
+ from pyxllib.text.pupil import briefstr
25
+
26
+ # 忽略特定的警告
27
+ warnings.filterwarnings("ignore", category=FutureWarning,
28
+ module="sklearn.cluster._agglomerative",
29
+ lineno=1005)
30
+
31
+
32
@run_once('str')
def get_levenshtein_similar(x, y):
    """Return the Levenshtein similarity ratio between ``x`` and ``y``.

    The call is wrapped by ``run_once('str')``; according to the original
    note this caches the pairwise results (presumably keyed on the
    stringified arguments -- TODO confirm in ``pyxllib.prog.pupil``).

    :param x: first string
    :param y: second string
    :return: the value of ``Levenshtein.ratio(x, y)``
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio
36
+
37
+
38
class MatchSimString:
    r"""Fuzzy-match a query string against a list of candidate strings.

    Candidates live in three parallel lists that share one index:
    ``origin_str`` (the text exactly as given), ``key_str`` (the text after
    the optional preprocessing callable) and ``ext_value`` (an arbitrary
    payload attached to the candidate). Similarity is ``Levenshtein.ratio``
    computed between a stored key and the query; the query itself is compared
    as passed in, it is not run through the preprocessing callable.

    Example::

        mss = MatchSimString()

        # 1 register the candidates
        mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
        mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
        mss.append_candidate('删除所有标签中间多余的空白')

        # 2 first query
        s = '奕本初一福周厦门培油'

        idx, sim = mss.match(s)
        print('matched:', mss[idx])  # 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
        print('similarity:', sim)    # 0.22

        # 3 second query
        s = '圆柱与【圆锥】_教案空白版'

        idx, sim = mss.match(s)
        print('matched:', mss[idx])  # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
        print('similarity:', sim)    # 0.375

    When ``append_candidate`` was given its second argument, the payload can
    be read back by index::

        mss.ext_value[idx]
    """

    def __init__(self, method=briefstr):
        """
        :param method: preprocessing applied to every candidate before it is
            stored as a key; anything that is not callable disables
            preprocessing
        """
        self.preproc = method
        self.origin_str = []  # candidates exactly as supplied
        self.key_str = []  # candidates after preprocessing; these are what gets compared
        self.ext_value = []  # optional payload stored alongside each candidate

    def __getitem__(self, item):
        """Return the original (unprocessed) candidate text at ``item``."""
        return self.origin_str[item]

    def __delitem__(self, item):
        """Remove the candidate at ``item`` from all three parallel lists."""
        del self.origin_str[item]
        del self.key_str[item]
        del self.ext_value[item]

    def __len__(self):
        """Return the number of stored candidates."""
        return len(self.key_str)

    def get_similarity(self, x, y):
        """Placeholder for a pairwise similarity hook.

        Not implemented: the body is empty, so the call returns ``None``.
        Nothing in this class calls it.
        """
        pass

    def append_candidate(self, k, v=None):
        """Register one candidate.

        :param k: candidate text; stored verbatim and, when ``self.preproc``
            is callable, also in preprocessed form as the comparison key
        :param v: optional payload, retrievable through ``self.ext_value``
        """
        self.origin_str.append(k)
        if callable(self.preproc):
            k = self.preproc(k)
        self.key_str.append(k)
        self.ext_value.append(v)

    def match(self, s):
        """Find the single best-matching candidate for ``s``.

        :param s: query string
        :return: ``(idx, sim)``; ``(-1, 0)`` when there are no candidates or
            no candidate has a similarity above zero. On ties the earliest
            candidate wins because only a strictly greater score replaces
            the current best.
        """
        best_idx, best_sim = -1, 0
        for i, key in enumerate(self.key_str):
            current = Levenshtein.ratio(key, s)
            if current > best_sim:
                best_idx, best_sim = i, current
        return best_idx, best_sim

    def match_many(self, s, count=1):
        """Return the ``count`` best-matching candidates for ``s``.

        :param str s: query string
        :param int count: number of matches to return
        :return: list of ``(idx, sim)`` pairs, highest similarity first
        """
        scores = [(i, Levenshtein.ratio(key, s)) for i, key in enumerate(self.key_str)]
        # The sort is stable, so equally similar candidates keep insertion order.
        return sorted(scores, key=lambda pair: pair[1], reverse=True)[:count]

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """Print a ranking table of all candidates against ``s``.

        :param s: query string
        :param count: how many rows to print

            * ``-1``: every candidate
            * ``0 < count < 1``: that fraction of the candidates (at least
              one row), e.g. ``0.4`` prints the best 40%
            * positive integer: the best ``count`` rows
        :param showstr: formatter applied to each key before it is displayed
        """
        # 1 score every candidate; row numbering starts at 0
        rows = [[i, ext, Levenshtein.ratio(key, s), showstr(key)]
                for i, (key, ext) in enumerate(zip(self.key_str, self.ext_value))]

        # 2 rank and truncate
        rows.sort(key=lambda row: -row[2])
        limit = len(rows)
        if 0 < count < 1:
            limit = max(1, int(limit * count))
        elif isinstance(count, int) and count > 0:
            limit = min(count, limit)
        rows = rows[:limit]

        # 3 print. The headers read: index, label, edit distance, content;
        #   the third column actually holds the similarity ratio.
        df = pd.DataFrame.from_records(rows, columns=('序号', '标签', '编辑距离', '内容'))
        text = dataframe_str(df)
        # Drop bullet characters; the original note says TeXstudio cannot display them.
        text = text.replace('\u2022', '')
        print(text)

    def agglomerative_clustering(self, threshold=0.5):
        """Cluster the stored keys hierarchically (complete linkage).

        The distance between two keys is ``1 - Levenshtein.ratio``.

        :param threshold: distance threshold handed to scikit-learn as
            ``distance_threshold``; the smaller the value, the more and the
            finer the resulting clusters
        :return: array with one cluster label per candidate, in candidate
            order. With fewer than two candidates scikit-learn cannot run, so
            every candidate (if any) simply gets label ``0``.
        """
        n = len(self)
        if n < 2:
            return np.zeros(n, dtype=int)

        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 symmetric pairwise distance matrix (the diagonal stays 0)
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # 2 cluster. scikit-learn renamed ``affinity`` to ``metric`` (deprecated
        #   in 1.2, removed in 1.4); try the new keyword first and fall back
        #   for older installations, whose constructor rejects it with TypeError.
        options = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            clustering = AgglomerativeClustering(metric='precomputed', **options)
        except TypeError:
            clustering = AgglomerativeClustering(affinity='precomputed', **options)
        return clustering.fit_predict(distance_matrix)

    def display_clusters(self, threshold=0.5):
        """Group the original candidate texts by cluster.

        :param threshold: forwarded to :meth:`agglomerative_clustering`
        :return: dict mapping cluster label to the list of original strings in
            that cluster, ordered from the largest cluster to the smallest
        """
        labels = self.agglomerative_clustering(threshold=threshold)

        clusters = defaultdict(list)
        for text, label in zip(self.origin_str, labels):
            clusters[label].append(text)

        # Largest cluster first; the stable sort keeps first-seen order on ties.
        ordered = sorted(clusters.items(), key=lambda kv: len(kv[1]), reverse=True)
        return dict(ordered)
191
+
192
+
193
class HierarchicalMatchSimString(MatchSimString):
    """Variant of ``MatchSimString`` that pre-groups candidates by clustering.

    Recommended by the original author when the candidate set is very large.
    """

    def __init__(self, method=briefstr):
        """
        :param method: preprocessing callable, forwarded to the base class
        """
        super().__init__(method)
        self.groups = dict()  # representative index -> member indices; filled by init_groups

    def get_center_sample(self, indices=None):
        """Return the index whose summed similarity to the given samples is highest.

        :param indices: candidate indices to consider; every stored candidate
            when omitted. Any iterable is accepted.
        :return: the winning index (the earliest one on ties)
        :raises ValueError: when there is nothing to choose from
        """
        if indices is None:
            indices = range(len(self))
        # Materialise once: the collection is walked repeatedly below, which
        # would silently exhaust a one-shot iterable.
        indices = list(indices)

        cache = {}  # unordered index pair -> similarity, so each pair is computed once

        def similarity(i, j):
            pair = (i, j) if i <= j else (j, i)
            if pair not in cache:
                cache[pair] = Levenshtein.ratio(self.key_str[i], self.key_str[j])
            return cache[pair]

        return max(indices, key=lambda i: sum(similarity(i, j) for j in indices))

    def merge_group(self, indices, threshold=0.5, strategy='center'):
        """Cluster the given candidate indices and pick one representative per cluster.

        :param indices: candidate indices to cluster
        :param threshold: distance threshold (distance is
            ``1 - Levenshtein.ratio``) passed to scikit-learn
        :param strategy: how the representative of a cluster is chosen

            * ``'center'``: the member with the smallest mean distance to the
              members of its cluster
            * ``'first'``: the first member of the cluster
        :return: dict mapping representative index to the list of member
            indices, largest cluster first
        :raises ValueError: for an unknown ``strategy``
        """
        if strategy not in ('center', 'first'):
            raise ValueError(f'Invalid strategy: {strategy}')

        indices = list(indices)
        n = len(indices)
        # scikit-learn refuses to cluster fewer than two samples, and
        # init_groups can legitimately hand over a one-element trailing batch.
        if n == 0:
            return {}
        if n == 1:
            return {indices[0]: [indices[0]]}

        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 symmetric pairwise distance matrix over the local positions 0..n-1
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                distance = 1 - Levenshtein.ratio(self.key_str[indices[i]], self.key_str[indices[j]])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # ``affinity`` was renamed to ``metric`` (deprecated in scikit-learn 1.2,
        # removed in 1.4); older releases reject the new keyword with TypeError.
        options = dict(n_clusters=None, distance_threshold=threshold, linkage='complete')
        try:
            clustering = AgglomerativeClustering(metric='precomputed', **options)
        except TypeError:
            clustering = AgglomerativeClustering(affinity='precomputed', **options)
        labels = clustering.fit_predict(distance_matrix)

        # 2 collect the local positions of every cluster
        positions_by_label = defaultdict(list)
        for position, label in enumerate(labels):
            positions_by_label[label].append(position)

        # 3 map a representative to its members, largest cluster first
        result = {}
        for positions in sorted(positions_by_label.values(), key=len, reverse=True):
            items = [indices[p] for p in positions]
            if strategy == 'first':
                representative = items[0]
            else:
                # Mean distance of each member to its cluster, read from the
                # matrix via local positions.
                sub_matrix = distance_matrix[np.ix_(positions, positions)]
                representative = items[int(np.argmin(sub_matrix.mean(axis=1)))]
            result[representative] = items

        return result

    def init_groups(self, threshold=0.5, batch_size=1000):
        """Build ``self.groups`` by repeatedly merging groups batch by batch.

        :param threshold: distance threshold; groups whose representatives
            fall within it are merged
        :param batch_size: number of group representatives compared at once.
            Comparing everything pairwise may be too expensive, so the work
            is done per batch; the result is less exact but far cheaper.
        :return: dict mapping representative index to member indices, largest
            group first (also stored in ``self.groups``)
        """
        # 1 every candidate starts out as its own group
        groups = {i: [i] for i in range(len(self))}

        # 2 merge until a full pass no longer reduces the number of groups
        while len(groups) > 1:
            merged = {}
            for batch in chunked(groups.keys(), batch_size):
                # merge_group only sees the representatives; expand each one
                # back into the members it already stands for.
                for rep, merged_reps in self.merge_group(batch, threshold=threshold).items():
                    merged[rep] = [member for r in merged_reps for member in groups[r]]

            if len(merged) == len(groups):
                break
            groups = merged

        # 3 order by group size, largest first (ties keep their existing order)
        self.groups = dict(sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True))
        return self.groups