pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. pyxllib/__init__.py +9 -2
  2. pyxllib/algo/__init__.py +8 -0
  3. pyxllib/algo/disjoint.py +54 -0
  4. pyxllib/algo/geo.py +541 -0
  5. pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
  6. pyxllib/algo/matcher.py +389 -0
  7. pyxllib/algo/newbie.py +166 -0
  8. pyxllib/algo/pupil.py +629 -0
  9. pyxllib/algo/shapelylib.py +67 -0
  10. pyxllib/algo/specialist.py +241 -0
  11. pyxllib/algo/stat.py +494 -0
  12. pyxllib/algo/treelib.py +149 -0
  13. pyxllib/algo/unitlib.py +66 -0
  14. pyxllib/autogui/__init__.py +5 -0
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/autogui/autogui.py +852 -0
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/autogui/virtualkey.py +102 -0
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +1 -11
  24. pyxllib/cv/expert.py +267 -0
  25. pyxllib/cv/{imlib.py → imfile.py} +18 -83
  26. pyxllib/cv/imhash.py +39 -0
  27. pyxllib/cv/pupil.py +9 -0
  28. pyxllib/cv/rgbfmt.py +1525 -0
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +163 -49
  31. pyxllib/cv/xlcvlib.py +1040 -0
  32. pyxllib/cv/xlpillib.py +423 -0
  33. pyxllib/data/__init__.py +0 -0
  34. pyxllib/data/echarts.py +240 -0
  35. pyxllib/data/jsonlib.py +89 -0
  36. pyxllib/{util/oss2_.py → data/oss.py} +11 -9
  37. pyxllib/data/pglib.py +1127 -0
  38. pyxllib/data/sqlite.py +568 -0
  39. pyxllib/{util → data}/sqllib.py +13 -31
  40. pyxllib/ext/JLineViewer.py +505 -0
  41. pyxllib/ext/__init__.py +6 -0
  42. pyxllib/{util → ext}/demolib.py +119 -35
  43. pyxllib/ext/drissionlib.py +277 -0
  44. pyxllib/ext/kq5034lib.py +12 -0
  45. pyxllib/{util/main.py → ext/old.py} +122 -284
  46. pyxllib/ext/qt.py +449 -0
  47. pyxllib/ext/robustprocfile.py +497 -0
  48. pyxllib/ext/seleniumlib.py +76 -0
  49. pyxllib/{util/tklib.py → ext/tk.py} +10 -11
  50. pyxllib/ext/unixlib.py +827 -0
  51. pyxllib/ext/utools.py +351 -0
  52. pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
  53. pyxllib/ext/win32lib.py +40 -0
  54. pyxllib/ext/wjxlib.py +88 -0
  55. pyxllib/ext/wpsapi.py +124 -0
  56. pyxllib/ext/xlwork.py +9 -0
  57. pyxllib/ext/yuquelib.py +1105 -0
  58. pyxllib/file/__init__.py +17 -0
  59. pyxllib/file/docxlib.py +761 -0
  60. pyxllib/{util → file}/gitlib.py +40 -27
  61. pyxllib/file/libreoffice.py +165 -0
  62. pyxllib/file/movielib.py +148 -0
  63. pyxllib/file/newbie.py +10 -0
  64. pyxllib/file/onenotelib.py +1469 -0
  65. pyxllib/file/packlib/__init__.py +330 -0
  66. pyxllib/{util → file/packlib}/zipfile.py +598 -195
  67. pyxllib/file/pdflib.py +426 -0
  68. pyxllib/file/pupil.py +185 -0
  69. pyxllib/file/specialist/__init__.py +685 -0
  70. pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
  71. pyxllib/file/specialist/download.py +193 -0
  72. pyxllib/file/specialist/filelib.py +2829 -0
  73. pyxllib/file/xlsxlib.py +3131 -0
  74. pyxllib/file/xlsyncfile.py +341 -0
  75. pyxllib/prog/__init__.py +5 -0
  76. pyxllib/prog/cachetools.py +64 -0
  77. pyxllib/prog/deprecatedlib.py +233 -0
  78. pyxllib/prog/filelock.py +42 -0
  79. pyxllib/prog/ipyexec.py +253 -0
  80. pyxllib/prog/multiprogs.py +940 -0
  81. pyxllib/prog/newbie.py +451 -0
  82. pyxllib/prog/pupil.py +1197 -0
  83. pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
  84. pyxllib/prog/specialist/__init__.py +391 -0
  85. pyxllib/prog/specialist/bc.py +203 -0
  86. pyxllib/prog/specialist/browser.py +497 -0
  87. pyxllib/prog/specialist/common.py +347 -0
  88. pyxllib/prog/specialist/datetime.py +199 -0
  89. pyxllib/prog/specialist/tictoc.py +240 -0
  90. pyxllib/prog/specialist/xllog.py +180 -0
  91. pyxllib/prog/xlosenv.py +108 -0
  92. pyxllib/stdlib/__init__.py +17 -0
  93. pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
  94. pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
  95. pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
  96. pyxllib/text/__init__.py +8 -0
  97. pyxllib/text/ahocorasick.py +39 -0
  98. pyxllib/text/airscript.js +744 -0
  99. pyxllib/text/charclasslib.py +121 -0
  100. pyxllib/text/jiebalib.py +267 -0
  101. pyxllib/text/jinjalib.py +32 -0
  102. pyxllib/text/jsa_ai_prompt.md +271 -0
  103. pyxllib/text/jscode.py +922 -0
  104. pyxllib/text/latex/__init__.py +158 -0
  105. pyxllib/text/levenshtein.py +303 -0
  106. pyxllib/text/nestenv.py +1215 -0
  107. pyxllib/text/newbie.py +300 -0
  108. pyxllib/text/pupil/__init__.py +8 -0
  109. pyxllib/text/pupil/common.py +1121 -0
  110. pyxllib/text/pupil/xlalign.py +326 -0
  111. pyxllib/text/pycode.py +47 -0
  112. pyxllib/text/specialist/__init__.py +8 -0
  113. pyxllib/text/specialist/common.py +112 -0
  114. pyxllib/text/specialist/ptag.py +186 -0
  115. pyxllib/text/spellchecker.py +172 -0
  116. pyxllib/text/templates/echart_base.html +11 -0
  117. pyxllib/text/templates/highlight_code.html +17 -0
  118. pyxllib/text/templates/latex_editor.html +103 -0
  119. pyxllib/text/vbacode.py +17 -0
  120. pyxllib/text/xmllib.py +747 -0
  121. pyxllib/xl.py +39 -0
  122. pyxllib/xlcv.py +17 -0
  123. pyxllib-0.3.197.dist-info/METADATA +48 -0
  124. pyxllib-0.3.197.dist-info/RECORD +126 -0
  125. {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
  126. pyxllib/basic/_1_strlib.py +0 -945
  127. pyxllib/basic/_2_timelib.py +0 -488
  128. pyxllib/basic/_3_pathlib.py +0 -916
  129. pyxllib/basic/_4_loglib.py +0 -419
  130. pyxllib/basic/__init__.py +0 -54
  131. pyxllib/basic/arrow_.py +0 -250
  132. pyxllib/basic/chardet_.py +0 -66
  133. pyxllib/basic/dirlib.py +0 -529
  134. pyxllib/basic/dprint.py +0 -202
  135. pyxllib/basic/extension.py +0 -12
  136. pyxllib/basic/judge.py +0 -31
  137. pyxllib/basic/log.py +0 -204
  138. pyxllib/basic/pathlib_.py +0 -705
  139. pyxllib/basic/pytictoc.py +0 -102
  140. pyxllib/basic/qiniu_.py +0 -61
  141. pyxllib/basic/strlib.py +0 -761
  142. pyxllib/basic/timer.py +0 -132
  143. pyxllib/cv/cv.py +0 -834
  144. pyxllib/cv/cvlib/_1_geo.py +0 -543
  145. pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
  146. pyxllib/cv/cvlib/_2_imgproc.py +0 -594
  147. pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
  148. pyxllib/cv/cvlib/_4_cvimg.py +0 -211
  149. pyxllib/cv/cvlib/__init__.py +0 -10
  150. pyxllib/cv/debugtools.py +0 -82
  151. pyxllib/cv/fitz_.py +0 -300
  152. pyxllib/cv/installer.py +0 -42
  153. pyxllib/debug/_0_installer.py +0 -38
  154. pyxllib/debug/_1_typelib.py +0 -277
  155. pyxllib/debug/_2_chrome.py +0 -198
  156. pyxllib/debug/_3_showdir.py +0 -161
  157. pyxllib/debug/_4_bcompare.py +0 -140
  158. pyxllib/debug/__init__.py +0 -49
  159. pyxllib/debug/bcompare.py +0 -132
  160. pyxllib/debug/chrome.py +0 -198
  161. pyxllib/debug/installer.py +0 -38
  162. pyxllib/debug/showdir.py +0 -158
  163. pyxllib/debug/typelib.py +0 -278
  164. pyxllib/image/__init__.py +0 -12
  165. pyxllib/torch/__init__.py +0 -20
  166. pyxllib/torch/modellib.py +0 -37
  167. pyxllib/torch/trainlib.py +0 -344
  168. pyxllib/util/__init__.py +0 -20
  169. pyxllib/util/aip_.py +0 -141
  170. pyxllib/util/casiadb.py +0 -59
  171. pyxllib/util/excellib.py +0 -495
  172. pyxllib/util/filelib.py +0 -612
  173. pyxllib/util/jsondata.py +0 -27
  174. pyxllib/util/jsondata2.py +0 -92
  175. pyxllib/util/labelmelib.py +0 -139
  176. pyxllib/util/onepy/__init__.py +0 -29
  177. pyxllib/util/onepy/onepy.py +0 -574
  178. pyxllib/util/onepy/onmanager.py +0 -170
  179. pyxllib/util/pyautogui_.py +0 -219
  180. pyxllib/util/textlib.py +0 -1305
  181. pyxllib/util/unorder.py +0 -22
  182. pyxllib/util/xmllib.py +0 -639
  183. pyxllib-0.0.43.dist-info/METADATA +0 -39
  184. pyxllib-0.0.43.dist-info/RECORD +0 -80
  185. pyxllib-0.0.43.dist-info/top_level.txt +0 -1
  186. {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/util/textlib.py DELETED
@@ -1,1305 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Data : 2018/12/27
6
-
7
-
8
- """
9
- 文本处理、常用正则匹配模式
10
-
11
- 下面大量的函数前缀含义:
12
- grp,generate regular pattern,生成正则模式字符串
13
- grr,generate regular replace,生成正则替换目标格式
14
- """
15
-
16
- from pyxllib.util.mathlib import *
17
-
18
- import base64
19
- import bisect
20
- import itertools
21
-
22
- ____section_0_import = """
23
- try ... except不影响效率的
24
- 主要是导入特殊包,好像是比较耗费时间,这里要占用掉0.1秒多时间
25
- """
26
-
27
- # 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
28
- # 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
29
- try:
30
- # MatchSimString计算编辑距离需要
31
- import Levenshtein
32
- except ModuleNotFoundError:
33
- subprocess.run(['pip', 'install', 'python-Levenshtein'])
34
- import Levenshtein
35
-
36
- # import textract # ensure_content读取word文档需要
37
-
38
- try: # 拼写检查库,即词汇库
39
- from spellchecker import SpellChecker
40
- except ModuleNotFoundError:
41
- subprocess.run(['pip', 'install', 'pyspellchecker'])
42
- from spellchecker import SpellChecker
43
-
44
- ____section_1_text = """
45
- 一些文本处理函数和类
46
- """
47
-
48
-
49
class ContentLine(object):
    """Analyse a piece of text through its line structure.

    Attributes:
        content: the text being analysed (whatever ``ensure_content`` returned).
        linepos: ``linepos[i - 1]`` is the offset of the newline that ends
            line ``i`` (1-based); ``len(content)`` is appended as a sentinel.
        lines: ``content.splitlines()``.
    """

    def __init__(self, content):
        """Initialise from text, or from anything ``ensure_content`` accepts."""
        self.content = ensure_content(content)
        self.linepos = [i for i, ch in enumerate(self.content) if ch == '\n']
        self.linepos.append(len(self.content))
        self.lines = self.content.splitlines()

    def line_start_pos(self, line):
        """Return the offset in ``content`` where 1-based line ``line`` starts.

        :raises ValueError: if ``line`` is smaller than 1
        :raises IndexError: if ``line`` is beyond the recorded line ends
        """
        if line < 1:
            raise ValueError(f'line numbers start at 1, got {line}')
        if line == 1:
            return 0
        # A line starts right after the newline that ended the previous one.
        return self.linepos[line - 2] + 1

    def lines_num(self):
        """Return the number of newline characters in ``content``.

        Text whose last line has no trailing newline therefore reports one
        less than ``len(self.lines)``.
        """
        return self.content.count('\n')

    def match_lines(self, pattern):
        """Return the 0-based indices of the lines accepted by ``pattern``.

        :param pattern: a regular expression tried with ``re.search``, or a
            callable that receives the line text and returns a truthy value
        """
        if callable(pattern):
            accept = pattern
        else:
            def accept(text):
                return re.search(pattern, text)
        return [i for i, line in enumerate(self.lines) if accept(line)]

    def in_line(self, ob):
        """Map ``ob`` to 1-based line number(s).

        * object with ``span()`` (e.g. a regex match): line of the span start
        * ``int``: line that contains the character at that offset
        * ``str``: list of the lines whose text equals ``ob``
        * any other iterable: list with ``in_line`` applied to each item

        :raises ValueError: for any other type
        """
        # Local import: ``collections.Iterable`` was removed in Python 3.10.
        from collections.abc import Iterable

        if hasattr(ob, 'span'):
            return self.in_line(ob.span()[0])
        if isinstance(ob, int):
            return bisect.bisect_right(self.linepos, ob - 1) + 1
        if isinstance(ob, str):
            return [i + 1 for i, line in enumerate(self.lines) if line == ob]
        if isinstance(ob, Iterable):
            return list(map(self.in_line, ob))
        raise ValueError(f'unsupported type: {type(ob).__name__}')

    def regular_search(self, re_str):
        """Return the 1-based line number of every match of ``re_str``."""
        return self.in_line(re.finditer(re_str, self.content))

    def lines_content(self, lines) -> str:
        """Return the given lines, each prefixed with its number.

        :param lines: 1-based line numbers; duplicates are dropped and the
            output is sorted
        """
        wanted = sorted(set(lines))
        return '\n'.join('{:6} {}'.format(n, self.lines[n - 1]) for n in wanted)

    def __str__(self):
        return self.content
124
-
125
-
126
def binary_cut_str(s, fmt='0'):
    """Repeatedly cut a string into a fraction of itself.

    :param s: the string to cut
    :param fmt: one digit per cutting step, applied left to right
        0: keep the left half
        1: keep the right 1/2
        2: keep the right 1/3
        ...
        9: keep the right 1/10
    :return: what is left after all steps (lengths are rounded down)

    >>> binary_cut_str('1234', '0')
    '12'
    >>> binary_cut_str('1234', '1')
    '34'
    >>> binary_cut_str('1234', '10')
    '3'
    >>> binary_cut_str('123456789', '20')
    '7'
    >>> binary_cut_str('123456789', '210')  # '21' leaves '9', then '0' leaves nothing
    ''
    """
    for code in map(int, fmt):
        keep = len(s) // (1 + max(1, code))
        s = s[:keep] if code == 0 else s[len(s) - keep:]
    return s
157
-
158
-
159
def digits2chinese(n):
    """Convert an integer in ``0..99`` to Chinese numerals.

    :param n: the integer to convert
    :return: e.g. ``'零'``, ``'七'``, ``'十'``, ``'十五'``, ``'二十'``, ``'二十一'``
    :raises NotImplementedError: if ``n`` is outside ``0..99``

    >>> digits2chinese(15)
    '十五'
    >>> digits2chinese(21)
    '二十一'
    """
    digits = '零一二三四五六七八九'
    if not 0 <= n < 100:
        raise NotImplementedError
    tens, ones = divmod(n, 10)
    if tens == 0:
        return digits[ones]
    # 10..19 are conventionally written without a leading "一".
    head = '十' if tens == 1 else digits[tens] + '十'
    return head + (digits[ones] if ones else '')
173
-
174
-
175
def chinese2digits(chinese_str):
    """Replace every run of Chinese numerals in a string with Arabic digits.

    Adapted from https://blog.csdn.net/leon_wzm/article/details/78963082

    Each run is read from right to left. ``big`` is the multiplier of the
    current 万/亿 section and ``r`` is the place value applied to the next
    digit; 十/百/千 are always scaled by the current section, so compound
    units such as "一百二十万" are handled.

    :param chinese_str: text that may contain Chinese numerals
    :return: the text with each numeral run replaced by its decimal value

    >>> chinese2digits('第二十三章')
    '第23章'
    >>> chinese2digits('一百二十万')
    '1200000'
    """
    digit_values = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
                    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
    small_units = {'十': 10, '百': 100, '千': 1000}
    big_units = {'万': 10000, '亿': 100000000}

    def convert(m):
        t = m.group().replace('百十', '百一十')
        total = 0
        big = 1  # multiplier of the current 万/亿 section
        r = 1  # place value of the next digit to the left
        for i in range(len(t) - 1, -1, -1):
            ch = t[i]
            if ch in digit_values:
                total += r * digit_values[ch]
                continue
            if ch in big_units:
                val = big_units[ch]
                # A smaller big unit left of a larger one stacks (一万亿).
                big = val if val > big else big * val
                r = big
            else:
                r = small_units[ch] * big
            if i == 0:
                # A leading unit has an implicit "一" in front (十三, 十万).
                total += r
        return str(total)

    return re.sub(r'[零一二两三四五六七八九十百千万亿]+', convert, chinese_str)
212
-
213
-
214
def digits2roman(d):
    """Convert 1..12 to the single-character Unicode Roman numeral.

    :param d: an integer, or anything ``int()`` accepts
    :raises NotImplementedError: if the value is outside ``1..12``

    >>> digits2roman(2)
    'Ⅱ'
    >>> digits2roman(12)
    'Ⅻ'
    """
    rmn = '~ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'  # index 0 is only a placeholder

    d = int(d)
    # Reject 0 and negatives: they would return the placeholder or wrap around.
    if 1 <= d <= 12:
        return rmn[d]
    raise NotImplementedError
228
-
229
-
230
def roman2digits(d):
    """Convert a single-character Unicode Roman numeral (Ⅰ..Ⅻ) to an int.

    :raises NotImplementedError: if ``d`` is not one of the twelve numerals

    >>> roman2digits('Ⅱ')
    2
    >>> roman2digits('Ⅻ')
    12
    """
    rmn = 'ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'
    # Require exactly one character: a substring test alone would also
    # accept '' and multi-character slices of the table.
    if isinstance(d, str) and len(d) == 1 and d in rmn:
        return rmn.index(d) + 1
    raise NotImplementedError
242
-
243
-
244
def digits2circlednumber(d):
    """Convert 1..20 to the matching circled-number character (① .. ⑳).

    :param d: an integer, or anything ``int()`` accepts
    :raises NotImplementedError: if the value is outside ``1..20``
    """
    d = int(d)
    if 0 < d <= 20:
        return '①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳'[d - 1]
    # NotImplementedError is the exception; ``NotImplemented`` cannot be raised.
    raise NotImplementedError
250
-
251
-
252
def circlednumber2digits(d):
    """Convert a circled-number character (① .. ⑳) to the integer 1..20.

    :raises NotImplementedError: if ``d`` is not exactly one of those characters
    """
    t = '①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳'
    # Require exactly one character: a substring test alone would also
    # accept '' and multi-character slices of the table.
    if isinstance(d, str) and len(d) == 1 and d in t:
        return t.index(d) + 1
    raise NotImplementedError
258
-
259
-
260
def gettag_name(tagstr):
    """Return the name of the first opening or closing tag in ``tagstr``.

    Returns None when no ``<name`` / ``</name`` is found.

    >>> gettag_name('%<topic type=danxuan description=单选题>')
    'topic'
    >>> gettag_name('</topic>')
    'topic'
    """
    found = re.search(r'</?([a-zA-Z_]+)', tagstr)
    return found.group(1) if found else None
272
-
273
-
274
def settag_name(tagstr, *, new_name=None, switch=None):
    """Rename a tag and/or turn it into an opening or closing tag.

    :param tagstr: the tag text
    :param new_name: if truthy, every tag name in ``tagstr`` is replaced by it
    :param switch: None leaves the tag kind alone; a truthy value turns
        ``</`` into ``<``; a falsy value returns a bare closing tag
        (all attributes are dropped, a ``%`` prefix is kept)

    >>> settag_name('%<topic type=danxuan description=单选题>', new_name='mdzz')
    '%<mdzz type=danxuan description=单选题>'
    >>> settag_name('<topic type=danxuan description=单选题>', switch=False)
    '</topic>'
    """
    if new_name:
        tagstr = re.sub(r'(</?)([a-zA-Z_]+)', lambda m: m.group(1) + new_name, tagstr)

    if switch is None:
        return tagstr
    if switch:
        return tagstr.replace('</', '<')

    closing = f'</{gettag_name(tagstr)}>'
    return '%' + closing if '%<' in tagstr else closing
294
-
295
-
296
def gettag_attr(tagstr, attrname):
    r"""Return the value of attribute ``attrname`` of the first tag in ``tagstr``.

    ``tagstr`` is parsed with BeautifulSoup using the ``'lxml'`` parser, and the
    first ``bs4.Tag`` among the children of the first ``<p>`` element is queried.
    Returns None if that tag lacks the attribute.

    >>> gettag_attr('%<topic type=danxuan description=单选题> 123\n<a b=c></a>', 'type')
    'danxuan'
    >>> gettag_attr('%<topic type="dan xu an" description=单选题>', 'type')
    'dan xu an'
    >>> gettag_attr("%<topic type='dan xu an' description=单选题>", 'type')
    'dan xu an'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'type')
    'dan'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description123') is None
    True
    """
    import bs4
    soup = BeautifulSoup(tagstr, 'lxml')
    try:
        # NOTE(review): presumably relies on lxml wrapping the leading text
        # (e.g. '%') and the tag in a <p>; an input that starts directly with
        # a tag may have no <p> at all - confirm.
        for tag in soup.p.contents:
            if isinstance(tag, bs4.Tag):
                return tag.get(attrname, None)
    except AttributeError:
        # soup.p is None when the parse produced no <p> element.
        dprint(tagstr)
    return None
324
-
325
-
326
def settag_attr(tagstr, attrname, target_value):
    r"""Set, replace or delete an attribute inside a tag string.

    :param tagstr: text that contains the tag
    :param attrname: attribute name (matched literally)
    :param target_value: new value; all whitespace is removed from it first.
        An empty value deletes the attribute.
    :return: the updated text. Closing tags (``</x>`` or ``%</x>``) and text
        without a ``>`` to insert before are returned unchanged.

    >>> settag_attr('%<topic type=danxuan> 123\n<a></a>', 'type', 'tiankong')
    '%<topic type="tiankong"> 123\n<a></a>'
    >>> settag_attr('%<topic>', 'type', 'tiankong')
    '%<topic type="tiankong">'
    >>> settag_attr('</topic>', 'type', 'tiankong')
    '</topic>'
    >>> settag_attr('<seq value="1">', 'value', '练习1.2')
    '<seq value="练习1.2">'
    >>> settag_attr('<seq type=123 value=1>', 'type', '')  # delete the attribute
    '<seq value=1>'
    >>> settag_attr('<seq type=123 value=1>', 'value', '')  # delete the attribute
    '<seq type=123>'
    >>> settag_attr('<seq type=123 value=1>', 'haha', '')  # nothing to delete
    '<seq type=123 value=1>'
    """
    # Closing tags carry no attributes.
    if tagstr.lstrip('%').startswith('</'):
        return tagstr

    target_value = re.sub(r'\s', '', target_value)
    # Groups: 1 = separator before the attribute, 2 = 'name=', 3 = old value,
    # 4 = whatever follows (the next 'attr=' or the closing '>').
    r = re.compile(r'(<|\s)(' + re.escape(attrname) + r'=)(.+?)(\s+\w+=|\s*>)')

    if not target_value:
        # Deleting: keep only what followed the attribute.
        return r.sub(lambda m: m.group(4), tagstr)

    if r.search(tagstr):
        # A function replacement keeps backslashes in the value literal.
        return r.sub(lambda m: m.group(1) + m.group(2) + '"' + target_value + '"' + m.group(4),
                     tagstr)

    n = tagstr.find('>')
    if n < 0:
        return tagstr
    return tagstr[:n] + ' ' + attrname + '="' + target_value + '"' + tagstr[n:]
366
-
367
-
368
def briefstr(s):
    """Reduce text to a compact key for approximate comparison.

    All whitespace is removed and the result is case-folded.
    """
    return re.sub(r'\s+', '', s).casefold()
378
-
379
-
380
def brieftexstr(s):
    """Reduce TeX text to a compact key for approximate comparison.

    Percent notes of the form ``%<...>`` are removed (the ``<...>`` part is
    matched with ``grp_bracket(2, '<', '>')``), then all whitespace is
    removed and the result is case-folded.
    """
    percent_note = r'%' + grp_bracket(2, '<', '>')
    without_notes = re.sub(percent_note, r'', s)
    return re.sub(r'\s+', '', without_notes).casefold()
391
-
392
-
393
class MatchSimString:
    """Find the candidate string most similar to a query.

    Usage::

        mss = MatchSimString()

        # 1 register candidates
        mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\\初一福厦培优-测试用')
        mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
        mss.append_candidate('删除所有标签中间多余的空白')

        # 2 first query
        idx, sim = mss.match('奕本初一福周厦门培油')
        print(mss[idx])  # the first candidate
        print(sim)       # 0.22

        # 3 second query
        idx, sim = mss.match('圆柱与【圆锥】_教案空白版')
        print(mss[idx])  # the second candidate
        print(sim)       # 0.375

    The optional second argument of ``append_candidate`` is kept in
    ``mss.ext_value[idx]``.

    Only candidates are passed through the preprocessing function; the query
    given to ``match`` / ``match_test`` is compared as-is.
    """

    def __init__(self, method=briefstr):
        self.preproc = method  # applied to each candidate if callable
        self.origin_str = []  # candidates as given
        self.key_str = []  # candidates after preprocessing
        self.ext_value = []  # extra value stored with each candidate

    def __getitem__(self, item):
        return self.origin_str[item]

    def __len__(self):
        return len(self.key_str)

    def append_candidate(self, k, v=None):
        """Register candidate ``k`` with an optional extra value ``v``."""
        self.origin_str.append(k)
        self.key_str.append(self.preproc(k) if callable(self.preproc) else k)
        self.ext_value.append(v)

    def match(self, s):
        """Return ``(index, ratio)`` of the candidate closest to ``s``.

        The ratio comes from ``Levenshtein.ratio``. ``(-1, 0)`` is returned
        when no candidate scores above 0; ties keep the earliest candidate.
        """
        best_idx, best_sim = -1, 0
        for pos in range(len(self)):
            score = Levenshtein.ratio(self.key_str[pos], s)
            if score > best_sim:
                best_idx, best_sim = pos, score
        return best_idx, best_sim

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """Print a table of candidates ranked by similarity to ``s``.

        :param s: the query string
        :param count: how many rows to print
            -1: all rows
            0 < count < 1: that fraction of the candidates (at least one row)
            positive int: at most that many rows
        :param showstr: turns a candidate key into the text that is displayed
        """
        # 1 score every candidate (rows are numbered from 0)
        total = len(self)
        rows = []
        for pos in range(total):
            key, extra = self.key_str[pos], self.ext_value[pos]
            score = Levenshtein.ratio(key, s)
            rows.append([pos, extra, score, showstr(key)])

        # 2 rank and truncate
        rows.sort(key=lambda row: -row[2])
        if 0 < count < 1:
            total = max(1, int(total * count))
        elif isinstance(count, int) and count > 0:
            total = min(count, total)
        rows = rows[:total]

        # 3 print
        df = pd.DataFrame.from_records(rows, columns=('序号', '标签', '编辑距离', '内容'))
        text = dataframe_str(df)
        # Per the original note, texstudio fails to display this bullet character.
        text = text.replace('\u2022', '')
        print(text)
485
-
486
-
487
def endswith(s, tags):
    """Like ``str.endswith``, but ``tags`` may also be a list or tuple of suffixes.

    :raises TypeError: if ``tags`` is not a str, list or tuple

    >>> endswith('a.dvi', ('.log', '.aux', '.dvi', 'busy'))
    True
    """
    if isinstance(tags, str):
        return s.endswith(tags)
    if not isinstance(tags, (list, tuple)):
        raise TypeError
    return any(s.endswith(suffix) for suffix in tags)
502
-
503
-
504
def mydictstr(d, key_value_delimit='=', item_delimit=' '):
    """Render a dict as ``key=value`` items joined by ``item_delimit``.

    Newlines inside a value are shown as a literal backslash-n.
    """
    return item_delimit.join(
        str(key) + key_value_delimit + str(value).replace('\n', r'\n')
        for key, value in d.items()
    )
511
-
512
-
513
def findnth(haystack, needle, n):
    """Return the start offset of the n-th (0-based) occurrence of ``needle``.

    Occurrences are counted without overlap. A negative ``n`` counts from
    the last occurrence. Returns -1 if there is no such occurrence.

    https://stackoverflow.com/questions/1883980/find-the-nth-occurrence-of-substring-in-a-string
    """
    if n < 0:
        n += haystack.count(needle)
        if n < 0:
            return -1

    pieces = haystack.split(needle, n + 1)
    if len(pieces) > n + 1:
        # Everything after the n-th occurrence is the last piece.
        return len(haystack) - len(pieces[-1]) - len(needle)
    return -1
524
-
525
-
526
def refine_digits_set(digits):
    """Format a collection of integers compactly, collapsing consecutive runs.

    Duplicates are dropped and the values sorted. A run of three or more
    consecutive values is written as ``first-last``; shorter runs are listed
    one by one.

    :param digits: iterable of integers
    :return: comma-separated text, ``''`` for empty input

    >>> refine_digits_set([210, 207, 207, 208, 211, 212])
    '207,208,210-212'
    >>> refine_digits_set([1, 2, 3, 5])
    '1-3,5'
    """
    arr = sorted(set(digits))
    n = len(arr)
    parts = []
    i = 0
    while i < n:
        # Extend j to the last element of the consecutive run starting at i.
        j = i
        while j + 1 < n and arr[j + 1] == arr[j] + 1:
            j += 1
        if j - i >= 2:
            parts.append(f'{arr[i]}-{arr[j]}')
        else:
            parts.extend(str(x) for x in arr[i:j + 1])
        i = j + 1
    return ','.join(parts)
548
-
549
-
550
def printoneline(s):
    """Print ``s`` shortened so that it fits on one terminal line.

    The width is the terminal's column count minus 3; when no terminal is
    attached (``OSError``, e.g. inside an IDE) a width of 500 is used.
    """
    try:
        width = os.get_terminal_size().columns - 3
    except OSError:
        width = 500
    print(shorten(s, width))
559
-
560
-
561
def del_tail_newline(s):
    """Remove one trailing newline from ``s`` if there is one.

    A string consisting of a single newline becomes the empty string.
    """
    # Slicing (not indexing) keeps this safe for the empty string.
    if s[-1:] == '\n':
        return s[:-1]
    return s
566
-
567
-
568
- ____section_2_regular = """
569
- 跟正则相关的一些文本处理函数和类
570
- """
571
-
572
-
573
def grp_bracket(depth=0, left='{', right=None):
    r"""Build a regex that matches a balanced bracket pair and its content.

    Similar to ``{.*?}``, except that opening and closing brackets are paired
    correctly up to ``depth`` levels of nesting. A larger depth handles deeper
    nesting at some cost in speed, e.g. ``grp_bracket(5)``.

    :param depth: number of nesting levels supported inside the outer pair
    :param left: opening character: ``(``, ``[`` or ``{`` (any other
        character needs an explicit ``right``)
    :param right: closing character; derived from ``left`` when None
    :return: the pattern string
    :raises NotImplementedError: if ``right`` is None and ``left`` is not
        one of ``(``, ``[``, ``{``
    :raises ValueError: if ``depth`` is negative

    Regex background:
    >>> re.sub(r'[^\[\]]', r'', r'a[b]a[]')  # drop everything but square brackets
    '[][]'
    >>> re.sub(r'[^\(\)]', r'', r'a(b)a()')  # drop everything but parentheses
    '()()'
    >>> re.sub(r'[^()]', r'', r'a(b)a()')  # the backslashes are optional here
    '()()'

    This function in use:
    >>> re.sub(grp_bracket(5), r'', r'x{aaa{b{d}b}ccc{d{{}e}ff}gg}y')
    'xy'
    >>> re.sub(grp_bracket(5, '(', ')'), r'', r'x(aaa(b(d)b)ccc(d(()e)ff)gg)y')
    'xy'
    >>> re.sub(grp_bracket(5, '[', ']'), r'', r'x[aaa[b[d]b]ccc[d[[]e]ff]gg]y')
    'xy'
    """
    if depth < 0:
        # The wrapping loop counts depth down to zero; a negative value
        # would never get there.
        raise ValueError(f'depth must not be negative, got {depth}')

    a, b = left, right
    if b is None:
        if a == '(':
            b = ')'
        elif a == '[':
            b = ']'
        elif a == '{':
            b = '}'
        else:
            raise NotImplementedError
    # Round and square brackets are regex metacharacters and need escaping.
    if a in '([':
        a = '\\' + a
    if b in ')]':
        b = '\\' + b
    c = f'[^{a}{b}]'  # any single character that is not a bracket

    # Depth 0 is a flat pair; each extra level allows one more nested pair.
    pattern = f'{a}{c}*{b}'
    for _ in range(depth):
        pattern = f'{a}(?:{c}|{pattern})*{b}'
    return pattern
633
-
634
-
635
# Common ready-made patterns. Each one captures only the text inside the
# outer brackets, not the brackets themselves; the number is the nesting
# depth that is supported.
# grp_bracket(n, '[') starts with the two characters '\[' and ends with '\]',
# so two characters are stripped from each side before adding the capture group.
SQUARE3 = r'\[(' + grp_bracket(3, '[')[2:-2] + r')\]'
BRACE1 = '{(' + grp_bracket(1)[1:-1] + ')}'
BRACE2 = '{(' + grp_bracket(2)[1:-1] + ')}'
BRACE3 = '{(' + grp_bracket(3)[1:-1] + ')}'
BRACE4 = '{(' + grp_bracket(4)[1:-1] + ')}'
BRACE5 = '{(' + grp_bracket(5)[1:-1] + ')}'
# Usage example:
#   m = re.search(r'\\multicolumn' + BRACE3*3, r'\multicolumn{2}{|c|}{$2^{12}$个数}')
#   m.groups()  ->  ('2', '|c|', '$2^{12}$个数')
647
-
648
-
649
def grp_figure(cnt_groups=0, parpic=False):
    r"""Build a regex that matches a LaTeX figure command.

    The commands recognised are ``\includegraphics``, ``\figt``, ``\figc``,
    ``\figr`` and ``\fig``, followed by a brace argument (the image name).

    :param cnt_groups: which parts become capture groups
        0: none
        1: the image name (without the braces)
        2: the command name and the image name
        3: the command name, the text between command and braces, the image name
        anything else: the function returns None
    :param parpic: also match a surrounding ``\parpic[.]{...}`` wrapper
    :return: the pattern string, or None
    """
    # 'fig' must stay last in the alternation so the longer names win.
    commands = r'includegraphics|figt|figc|figr|fig'

    if cnt_groups == 0:
        pattern = r'\\(?:' + commands + r').*?' + grp_bracket(3)
    elif cnt_groups == 1:
        pattern = r'\\(?:' + commands + r').*?' + BRACE3
    elif cnt_groups == 2:
        pattern = r'\\(' + commands + r').*?' + BRACE3
    elif cnt_groups == 3:
        pattern = r'\\(' + commands + r')(.*?)' + BRACE3
    else:
        return None

    if parpic:
        pattern = r'{?\\parpic(?:\[.\])?{' + pattern + r'}*'
    return pattern
672
-
673
-
674
def grp_topic(*, type_value=None):
    """Return a regex that spans one ``%<topic ...>`` ... ``%</topic>`` block.

    Callers need ``flags=re.DOTALL`` for blocks that span several lines.

    :param type_value: intended to select a topic type; currently ignored
    """
    return r'%<topic.*?%</topic>'
681
-
682
-
683
def grp_chinese_char():
    """Return a character class for CJK ideographs plus common Chinese punctuation and ①-⑨."""
    pattern = r'[\u4e00-\u9fa5,。;?()【】、①-⑨]'
    return pattern
685
-
686
-
687
def grr_check(m):
    """Replacement callback for ``re.sub`` that leaves the match unchanged.

    Placeholder for inspecting matches; it currently returns the matched text.
    """
    return m.group()
692
-
693
-
694
def regularcheck(pattern, string, flags=0):
    """Build a text report of every match of ``pattern`` in ``string``.

    :param pattern: regular expression passed to ``re.finditer``
    :param string: the text to search
    :param flags: regex flags passed to ``re.finditer``
    :return: a header line naming the pattern, followed by the matches
        rendered as a table via ``dataframe_str``
    """
    arr = []
    cl = ContentLine(string)
    for i, m in enumerate(re.finditer(pattern, string, flags)):
        # Each capture group is abbreviated to at most 200 characters.
        ss = map(lambda x: textwrap.shorten(x, 200), m.groups())
        # Row layout: running match number, line of the match start, then the groups.
        arr.append([i + 1, cl.in_line(m.start(0)), *ss])
    # NOTE(review): each row has two leading cells but only one leading header
    # ('行号') is supplied; if len_in_dim2 returns the row length, the header
    # is one column short of the data - confirm against len_in_dim2.
    tablehead = ['行号'] + list(map(lambda x: f'第{x}组', range(len_in_dim2(arr) - 2)))
    df = pd.DataFrame.from_records(arr, columns=tablehead)
    res = f'正则模式:{pattern},匹配结果:\n' + dataframe_str(df)
    return res
704
-
705
-
706
class StrIdxBack:
    r"""Delete noise characters from a string, match on what is left, and map
    the match positions back to the original string.

    >>> ob = StrIdxBack('bxx  ax xbxax')
    >>> ob.delchars(r'[ x]+')
    >>> ob  # spaces and the character x removed
    baba
    >>> print(ob.idx)  # original offset of every character kept in keystr
    (0, 5, 9, 11)
    >>> m = re.match(r'b(ab)', ob.keystr)
    >>> m = ob.matchback(m)
    >>> m.group(1)
    'ax xb'
    >>> ob.search('ab')  # the corresponding text in the original string
    'ax xb'
    """

    def __init__(self, s):
        self.oristr = s  # the original string, never modified
        self.idx = tuple(range(len(s)))  # original offsets of the characters still kept
        self.keystr = s  # the string after all deletions so far

    def delchars(self, pattern, flags=0):
        """Delete everything in ``keystr`` that matches ``pattern``.

        Works like ``re.sub`` with an empty replacement, and uses the match
        spans to keep ``idx`` in step with ``keystr``.

        >>> ob = StrIdxBack('abc123df4a'); ob.delchars(r'\d+'); str(ob)
        'abcdfa'
        >>> ob.idx
        (0, 1, 2, 6, 7, 9)
        """
        k = 0  # end of the previous match within the current keystr
        idxs = []  # slices of self.idx that survive the deletion

        def repl(m):
            nonlocal k, idxs
            # Keep the offsets between the previous match and this one.
            idxs.append(self.idx[k:m.start(0)])
            k = m.end(0)
            return ''

        self.keystr = re.sub(pattern, repl, self.keystr, flags=flags)
        idxs.append(self.idx[k:])  # tail after the last match
        self.idx = tuple(itertools.chain(*idxs))

    def compare_newstr(self, limit=300):
        r"""Return the kept characters laid out at their original positions.

        Deleted characters are shown as blanks (a deleted space as ``_``) so
        the result lines up under the original string. Only the first
        ``limit`` characters of the original are processed.

        >>> ob = StrIdxBack('abab'); ob.delchars('b'); ob.compare_newstr()
        'a a '
        """
        s1 = self.oristr
        dd = set(self.idx)

        s2 = []
        k = 0  # counts kept characters; not used afterwards
        for i in range(min(len(s1), limit)):
            if i in dd:
                s2.append(s1[i])
                k += 1
            else:
                if ord(s1[i]) < 128:
                    if s1[i] == ' ':  # a deleted space is shown as '_'
                        s2.append('_')
                    else:  # any other deleted ASCII character is shown as a blank
                        s2.append(' ')
                else:
                    # NOTE(review): the original comment says wide (Chinese)
                    # characters need two blanks to stay aligned; confirm the
                    # width of this literal.
                    s2.append(' ')
        s2 = ''.join(s2)
        s2 = s2.replace('\n', r'\n')

        return s2

    def compare(self, limit=300):
        """Return the original string and the aligned kept characters on two lines."""
        s1 = self.oristr

        s1 = s1.replace('\n', r'\n')[:limit]
        s2 = self.compare_newstr(limit)

        return s1 + '\n' + s2 + '\n'

    def matchback(self, m):
        """Map a match object obtained on ``keystr`` to positions in ``oristr``.

        Returns a ``ReMatch`` built from the translated spans.
        """
        regs = []
        for rs in getattr(m, 'regs'):
            # The end is exclusive: translate the last matched character and add one.
            # NOTE(review): a group that did not participate has span (-1, -1)
            # and an empty match has start == end; both look mis-mapped here - confirm.
            regs.append((self.idx[rs[0]], self.idx[rs[1] - 1] + 1))
        return ReMatch(regs, self.oristr, m.pos, len(self.oristr), m.lastindex, m.lastgroup, m.re)

    def search(self, pattern):
        """Search ``keystr`` for ``pattern`` and return the corresponding text of ``oristr``.

        Returns ``''`` when nothing matches.
        """
        m = re.search(pattern, self.keystr)
        if m:
            m = self.matchback(m)  # IDEs may flag ``regs`` as unknown; it exists on match objects
            return m.group()
        else:
            return ''

    def __repr__(self):
        """Return the string as it is after all deletions."""
        return self.keystr
808
-
809
-
810
def bracket_match(s, idx):
    """Return the index of the bracket that pairs with the one at ``s[idx]``.

    Supports ``{}``, ``[]``, ``()`` and ``<>``. From an opening bracket the
    scan runs forward, from a closing bracket backward.

    :param s: the text
    :param idx: position of a bracket; negative values count from the end
    :return: index of the partner, or None if ``idx`` is out of range,
        ``s[idx]`` is not a bracket, or no partner exists

    >>> bracket_match('{123}', 0)
    4
    >>> bracket_match('0{23{5}}89', 1)
    7
    >>> bracket_match('0{23{5}}89', 7)
    1
    >>> bracket_match('0{23{5}78', 1) is None
    True
    >>> bracket_match('0{23{5}78', 20) is None
    True
    >>> bracket_match('0[2[4]{7}]01', 9)
    1
    >>> bracket_match('0{[34{6}89}', -4)
    5
    >>> bracket_match('}a{', 0) is None
    True
    """
    key = '{[(<>)]}'  # mirrored layout: the partner of key[i] is key[-1 - i]
    n = len(s)
    if idx < 0:
        idx += n
    if not 0 <= idx < n:
        return None

    ch1 = s[idx]
    idx1 = key.find(ch1)
    if idx1 < 0:  # not a bracket
        return None
    idx2 = len(key) - idx1 - 1
    ch2 = key[idx2]
    step = 1 if idx2 > idx1 else -1  # forward from an opener, backward from a closer

    cnt = 1
    i = idx + step
    # The scan stops at either end of the string; it never wraps around.
    while 0 <= i < n:
        if s[i] == ch1:
            cnt += 1
        elif s[i] == ch2:
            cnt -= 1
            if cnt == 0:
                return i
        i += step
    return None
854
-
855
-
856
def bracket_match2(s, idx):
    r"""Like ``bracket_match``, but brackets preceded by a backslash are ignored.

    Supports ``{}``, ``[]``, ``()`` and ``<>``. From an opening bracket the
    scan runs forward, from a closing bracket backward. A character whose
    predecessor is a backslash is skipped while scanning (the backslash
    itself is not checked for being escaped).

    :param s: the text
    :param idx: position of a bracket; negative values count from the end
    :return: index of the partner, or None if ``idx`` is out of range,
        ``s[idx]`` is not a bracket, or no partner exists

    >>> bracket_match2('a{b{}b}c', 1)
    6
    >>> bracket_match2('a{b{\}b}c}d', 1)
    9
    """
    key = '{[(<>)]}'  # mirrored layout: the partner of key[i] is key[-1 - i]
    n = len(s)
    if idx < 0:
        idx += n
    if not 0 <= idx < n:
        return None

    ch1 = s[idx]
    idx1 = key.find(ch1)
    if idx1 < 0:  # not a bracket
        return None
    idx2 = len(key) - idx1 - 1
    ch2 = key[idx2]
    step = 1 if idx2 > idx1 else -1  # forward from an opener, backward from a closer

    cnt = 1
    i = idx + step
    # The scan stops at either end of the string; it never wraps around.
    while 0 <= i < n:
        if i and s[i - 1] == '\\':
            pass  # escaped character: not counted as a bracket
        elif s[i] == ch1:
            cnt += 1
        elif s[i] == ch2:
            cnt -= 1
            if cnt == 0:
                return i
        i += step
    return None
892
-
893
-
894
- ____section_3_ensure_content = """
895
- 从任意类型文件读取文本数据的功能
896
- """
897
-
898
-
899
def readtext(filename, encoding=None):
    """Read an ordinary text file and return its content as ``str``.

    The file is read as raw bytes and then decoded, ignoring undecodable bytes.

    :param filename: path of the file to read
    :param encoding: encoding used for decoding; when empty it is obtained from
        ``get_encoding`` applied to the raw bytes
    :return: decoded text with every ``\\r\\n`` replaced by ``\\n``, or None when
        the file does not exist
    """
    try:
        # Binary mode: no newline translation happens while reading.
        with open(filename, 'rb') as fp:
            raw = fp.read()
    except FileNotFoundError:
        return None

    codec = encoding or get_encoding(raw)
    text = raw.decode(encoding=codec, errors='ignore')
    # Independent of the codec in use: "\r\n" line endings are awkward downstream.
    return text.replace('\r\n', '\n') if '\r' in text else text
915
-
916
-
917
def ensure_content(ob=None, encoding=None):
    """Return text content for a variety of inputs.

    :param ob:
        * None: read the whole of standard input (terminated by Ctrl + D on a console)
        * object with a callable ``read`` attribute: return ``ob.read()``
        * name of an existing file: return the file content
            - ``.docx`` is converted with the optional ``textract`` package
            - ``.doc`` and ``.pdf`` are not supported yet
            - anything else is read as plain text through ``readtext``
        * any other string (or path that is not an existing file): returned unchanged
    :param encoding: encoding forced on plain-text files; passed to ``readtext``
    :return: the text content, or ``ob`` itself when it is not an existing file
    :raises ModuleNotFoundError: a ``.docx`` file was given but ``textract`` is missing
    :raises NotImplementedError: a ``.doc`` or ``.pdf`` file was given
    """
    if ob is None:
        return sys.stdin.read()  # input ends with Ctrl + D

    # File-like objects: hand back everything they can still deliver.
    reader = getattr(ob, 'read', None)
    if callable(reader):
        return reader()

    try:
        is_file = Path(ob).is_file()
    except (OSError, ValueError):
        # Arbitrary text is not always a legal path (too long, embedded NUL, ...);
        # such input can only be plain content.
        is_file = False
    if not is_file:
        return ob  # cannot be resolved to a file: treat it as the content itself

    name = str(ob)  # also lets pathlib.Path inputs go through the suffix checks
    if name.endswith('.docx'):
        try:
            import textract
        except ModuleNotFoundError as err:
            dprint()  # textract is missing, see https://blog.csdn.net/code4101/article/details/79328636
            raise ModuleNotFoundError('textract is required to read .docx files') from err
        text = textract.process(name)
        return text.decode('utf8', errors='ignore')
    if name.endswith('.doc'):
        raise NotImplementedError
    if name.endswith('.pdf'):
        raise NotImplementedError
    # Everything else is read as an ordinary text file.
    return readtext(ob, encoding)
950
-
951
-
952
def file_lastlines(fn, n, encoding='utf8'):
    """Return the last ``n`` lines of a file, similar to ``tail -n``.

    Reference: https://stackoverflow.com/questions/136168/get-last-n-lines-of-a-file-with-python-similar-to-tail

    The file is read backwards in blocks of growing size, so only the tail of a
    large file is loaded.

    :param fn: path of the file
    :param n: number of lines wanted, must be >= 0
    :param encoding: encoding used to decode the tail (undecodable bytes are ignored)
    :return: the last ``n`` lines joined into one string, line endings kept and
        ``\\r\\n`` normalised to ``\\n``; fewer lines when the file is shorter
    :raises ValueError: ``n`` is negative

    >> file_lastlines('book.log', 1)
    'Output written on book.dvi (2 pages, 7812 bytes).'
    """
    if n < 0:
        raise ValueError('n must be >= 0, got ' + str(n))
    if n == 0:
        return ''

    chunks = []
    newline_cnt = 0
    block = 1024
    with open(fn, 'rb') as f:  # binary mode is required for seeking relative to the end
        f.seek(0, 2)
        remaining = f.tell()
        # More than n newlines guarantees that the last n lines are complete.
        while remaining > 0 and newline_cnt <= n:
            size = min(block, remaining)
            remaining -= size
            f.seek(remaining)
            chunk = f.read(size)
            chunks.append(chunk)
            newline_cnt += chunk.count(b'\n')
            block *= 2

    # Chunks were collected back to front; join the bytes before decoding so that
    # multi-byte characters split across chunk borders stay intact.
    data = b''.join(reversed(chunks))
    tail = b''.join(data.splitlines(keepends=True)[-n:])
    return tail.decode(encoding, errors='ignore').replace('\r\n', '\n')
973
-
974
-
975
# Section marker (appears to act only as a divider in this module):
# spell checking; per the original note, written on Monday 2019-09-23 21:54
# for a cloze-test OCR recognition project.
____section_4_spell_check = """
拼写检查
190923周一21:54,源自 完形填空ocr 识别项目
"""
979
-
980
-
981
class MySpellChecker(SpellChecker):
    """SpellChecker extended with a per-word table of observed corrections.

    ``self.checkdict`` maps a written word to a Counter ``{replacement: weight}``.
    It is seeded from the word-frequency dictionary (every known word maps to
    itself) and can be refined with manually collected corrections supplied as a
    DataFrame, see ``update_by_dataframe``.
    """

    def __init__(self, language="en", local_dictionary=None, distance=2, tokenizer=None, case_sensitive=False,
                 df=None):
        """
        :param language, local_dictionary, distance, tokenizer, case_sensitive:
            forwarded unchanged to ``SpellChecker.__init__``
        :param df: optional DataFrame of known corrections, see ``update_by_dataframe``
        """
        from collections import defaultdict, Counter

        # 1 original initialisation
        super(MySpellChecker, self).__init__(language=language, local_dictionary=local_dictionary,
                                             distance=distance, tokenizer=tokenizer,
                                             case_sensitive=case_sensitive)

        # 2 additional dictionary used for the analysis
        self.checkdict = defaultdict(Counter)
        for k, v in self.word_frequency._dictionary.items():
            self.checkdict[k][k] = v

        # 3 apply the corrections of df when one is given
        # (``if df:`` would raise: the truth value of a DataFrame is ambiguous)
        if df is not None:
            self.update_by_dataframe(df)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as str, decoding it first when it is bytes."""
        if isinstance(value, (bytes, bytearray)):
            return value.decode()
        return value if isinstance(value, str) else str(value)

    def update_by_dataframe(self, df, weight_times=1):
        """Merge recorded corrections into the frequency dictionary and ``checkdict``.

        :param df: DataFrame with the columns ``old`` (word as written), ``new``
            (word it was corrected to) and ``count``; it is not modified
        :param weight_times: factor applied to every ``count``
        :return: None
        """
        # 1 case handling: when the checker is case-insensitive the words of df are
        #   lower-cased (per row, so the caller's DataFrame stays untouched)
        lower = not self._case_sensitive

        # 2 effect of df on self.word_frequency._dictionary and self.checkdict
        d = self.word_frequency._dictionary
        for _, row in df.iterrows():
            old, new = self._as_text(row['old']), self._as_text(row['new'])
            if lower:
                old, new = old.lower(), new.lower()
            count = row['count'] * weight_times
            # a confirmed spelling gains weight, a corrected one loses weight
            d[old] += count if old == new else -count
            self.checkdict[old][new] += count

        # 3 drop the words whose weight is no longer positive
        self.word_frequency.remove_words([k for k, v in d.items() if v <= 0])

    def _ensure_term(self, term):
        """Create the ``checkdict`` entry of ``term`` from ``candidates`` when it is missing."""
        if term not in self.checkdict:
            from collections import Counter
            freq = self.word_frequency._dictionary
            # candidates() may return None when nothing is found (newer pyspellchecker)
            found = self.candidates(term) or ()
            # stored as a Counter so that later ``+=`` updates cannot hit a missing key
            self.checkdict[term] = Counter({k: freq[k] for k in found})

    def correction(self, term):
        """Return the most likely correction of ``term``.

        :return: ``term`` itself when it is a known word; otherwise the candidate
            with the largest weight, prefixed with ``'^'`` when that weight is not
            positive (a wrong word without a recommended fix)
        """
        # 1 already correct
        w = term if self._case_sensitive else term.lower()
        if w in self.word_frequency._dictionary:
            return term

        # 2 wrong and not recorded yet: compute the candidates once
        self._ensure_term(w)

        # 3 return the candidate with the largest weight
        options = self.checkdict[w]
        if not options:  # no candidate at all: mark the word itself
            return '^' + w
        res = max(options, key=options.get)
        if options.get(res) <= 0:
            res = '^' + res  # wrong word, but nothing to recommend
        return res

    def correction_detail(self, term):
        """Return all candidates of ``term`` with their weights, best first.

        >> a.correction_detail('d')
        [('d', 9131), ('do', 1), ('old', 1)]
        """
        w = term if self._case_sensitive else term.lower()
        self._ensure_term(w)
        return sorted(self.checkdict[w].items(), key=lambda x: x[1], reverse=True)
1053
-
1054
-
1055
def demo_myspellchecker():
    """Demo: build a MySpellChecker and pass the correction details of a few words to dprint."""
    # Constructing the class takes roughly 0.4 seconds
    a = MySpellChecker()

    # Loading the update from SQL takes roughly 1 second
    # hsql = HistudySQL('ckz', 'tr_develop')
    # df = hsql.query('SELECT * FROM spell_check')
    # a.update_by_dataframe(df)

    # dprint(a.correction_detail('d'))
    # dprint(a.correction_detail('wrod'))  # 'wrod' has many possibilities, but 'word' has the largest weight
    # dprint(a.correction_detail('ckzckzckzckzckzckz'))  # 'wrod' has many possibilities, but 'word' has the largest weight
    # dprint(a.correction('ckzckzckzckzckzckz'))  # 'wrod' has many possibilities, but 'word' has the largest weight
    dprint(a.correction_detail('ike'))
    dprint(a.correction_detail('dean'))
    dprint(a.correction_detail('stud'))
    dprint(a.correction_detail('U'))
1072
-
1073
-
1074
# Section marker (appears to act only as a divider in this module):
# newly added, temporary functionality.
____section_temp = """
临时添加的新功能
"""
1077
-
1078
-
1079
def count_word(s, *patterns):
    """Count how often the given regular-expression patterns occur in a text.

    :param s: text content (converted with ``str``)
    :param patterns:
        regular expressions, handled one after another in priority order; every
        part matched by a pattern is replaced with ``\\x00`` before the next
        pattern is applied
    :return:
        without patterns: ``Counter.most_common()`` over the single characters,
        i.e. ``(char, count)`` pairs;
        with patterns: a list of ``(count, text)`` pairs, most frequent first,
        where ``text`` is the ``repr`` of the match without its first and last
        character
    """
    text = str(s)

    # No pattern given: count every single character.
    if not patterns:
        return collections.Counter(list(text)).most_common()

    hits = []
    for pattern in patterns:
        hits.extend(re.findall(pattern, text))
        text = re.sub(pattern, '\x00', text)  # mask what has been consumed

    ranking = collections.Counter(hits).most_common()
    return [(freq, repr(item)[1:-1]) for item, freq in ranking]
1103
-
1104
-
1105
class Base85Coder:
    """Base85 encoder / decoder with an optional substitution key.

    Encoding turns the plain text into base85 and then, when a key is set, maps
    every base85 character through the key. Decoding runs in the opposite order:
    first undo the key substitution, then decode the base85 data.

    Usage example:
        key = 'xV~>Y|@muL<UK$*agCQp=t4c0R_y`Z2;q%s?o8S9(3D5W^-NA&}6v){Twj7MzGePJEfik1bBhn!d#I+HlXFOr'
        coder = Base85Coder(key)
        b = coder.encode('陈坤泽 abc')
        dprint(b)  # b<str>=d@7;B}ww?}zfGP#;1
        s = coder.decode(b)
        dprint(s)  # s<str>=陈坤泽 abc
    """
    DEFAULT_KEY = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~'
    CHARS_SET = set(DEFAULT_KEY)

    def __init__(self, key=None):
        """
        :param key: optional substitution key; it must consist of exactly the 85
            characters of ``DEFAULT_KEY`` (in any order), otherwise it is passed
            to ``dprint`` and discarded
        """
        # 1 validate the key
        cls = Base85Coder
        if key and (len(key) != 85 or set(key) != cls.CHARS_SET):
            dprint(key)  # the given key is invalid
            key = None
        self.key = key

        # 2 build the translation tables
        if not key:
            self.encode_trantab = self.decode_trantab = None
        else:
            self.encode_trantab = str.maketrans(cls.DEFAULT_KEY, key)
            self.decode_trantab = str.maketrans(key, cls.DEFAULT_KEY)

    def encode(self, s):
        """Encode the string ``s`` (UTF-8) and return the resulting base85 text as str."""
        # base85 output is pure ASCII, so decoding yields the same characters as
        # stripping the b'...' wrapper from its repr
        encoded = base64.b85encode(s.encode('utf8')).decode('ascii')
        if self.encode_trantab:
            encoded = encoded.translate(self.encode_trantab)
        return encoded

    def decode(self, b):
        """Decode the text ``b`` produced by ``encode`` and return the original string."""
        text = b.translate(self.decode_trantab) if self.decode_trantab else b
        return base64.b85decode(text.encode('ascii')).decode('utf8')
1152
-
1153
-
1154
def demo_spellchecker():
    """Demonstrate how to use the spellchecker library.

    Official introduction, pyspellchecker · PyPI: https://pypi.org/project/pyspellchecker/
    Written on Monday 2019-09-09 15:58, by 陈坤泽
    """
    # 0 Install and import the library
    #   The spellchecker module mainly has two classes, SpellChecker and WordFrequency
    #   WordFrequency is a word-frequency class
    #   Usually importing SpellChecker is enough: from spellchecker import SpellChecker
    try:  # spell-checking library, i.e. the vocabulary
        from spellchecker import SpellChecker
    except ModuleNotFoundError:
        # NOTE: installs the package with pip3 as a side effect when it is missing
        subprocess.run(['pip3', 'install', 'pyspellchecker'])
        from spellchecker import SpellChecker

    # 1 Create the object
    #   Language, case sensitivity and the maximum edit distance can be configured
    #   Default is 'en' (English), case-insensitive
    spell = SpellChecker()
    # For English, SpellChecker automatically loads the language pack
    # site-packages\spellchecker\resources\en.json.gz, about 120,000 words including frequency weights
    d = spell.word_frequency  # d is a WordFrequency object; it stores its data in a Counter underneath
    dprint(d.unique_words, d.total_words)  # number of words, sum of the weights

    # 2 Modify the frequency table spell.word_frequency
    dprint(d['ckz'])  # a word that does not exist simply yields 0
    d.add('ckz')  # adds one occurrence of the word 'ckz'
    d.load_words(['ckz', 'ckz', 'lyb'])  # words can be added in batches
    dprint(d['ckz'], d['lyb'])  # d['ckz']=3 d['lyb']=1
    d.load_words(['ckz'] * 100 + ['lyb'] * 500)  # this trick adds large weights
    dprint(d['ckz'], d['lyb'])  # d['ckz']=103 d['lyb']=501

    # Likewise, removal has the two methods remove and remove_words
    d.remove('ckz')
    # d.remove_words(['ckz', 'lyb'])  # note: a key that no longer exists ('ckz') cannot be removed, it raises KeyError
    dprint(d['ckz'], d['lyb'])  # d['ckz']=0 d['lyb']=501
    # remove deletes the word completely; to only lower the weight, operate on the underlying _dictionary object
    d._dictionary['lyb'] -= 100  # although accessing underscore-prefixed members directly is not really recommended
    dprint(d['lyb'])  # ['lyb']=401

    # Words whose frequency does not exceed a threshold can also be removed
    d.remove_by_threshold(5)

    # 3 Basic functionality of spell
    # (1) unknown finds the words that may be misspelled, correction then gives the best suggestion
    misspelled = spell.unknown(['something', 'is', 'hapenning', 'here'])
    dprint(misspelled)  # misspelled<set>={'hapenning'}

    for word in misspelled:
        # Get the one `most likely` answer
        dprint(spell.correction(word))  # <str>='happening'
        # Get a list of `likely` options
        dprint(spell.candidates(word))  # <set>={'henning', 'happening', 'penning'}

    # Note that the default spell is case-insensitive: if the vocabulary stores 100 occurrences of 'ckz',
    #   checking any upper/lower-case combination of 'CKZ' returns the input unchanged
    #   e.g. spell.correction('ckZ') => 'ckZ'

    # (2) Changing spell.word_frequency influences the result computed by correction
    dprint(d['henning'], d['happening'], d['penning'])
    # d['henning']<int>=53 d['happening']<int>=4538 d['penning']<int>=23
    d._dictionary['henning'] += 10000
    dprint(spell.correction('hapenning'))  # <str>='henning'

    # (3) Weight of a word within the whole dictionary
    dprint(spell.word_probability('henning'))  # <float>=0.0001040741914298211
1219
-
1220
-
1221
def check_text_row_column(s):
    """Analyse the number of rows and columns of a text table.

    Rows are separated by line breaks; columns are separated by a tab or by a
    run of at least four spaces. Every line is stripped before it is split.

    :return:
        ``(n, m)`` when every row has the same number of columns (``(0, 0)`` for
        empty input);
        otherwise a list with the column count of each row, every value negated
        to signal the mismatch
    """
    if not s:
        return (0, 0)

    separator = re.compile(r'( {4,}|\t)+')
    # Number of columns per row: collapse each separator run to one tab, then split.
    widths = [len(separator.sub(r'\t', row.strip()).split('\t')) for row in s.splitlines()]

    if min(widths) != max(widths):
        return [-width for width in widths]
    return len(widths), widths[0]
1236
-
1237
-
1238
class ListingFormat:
    r"""Formatter for list labels (enumeration markers).

    >>> li = ListingFormat('(1)')
    >>> li
    (1)
    >>> li.next()
    >>> li
    (2)

    >>> li = ListingFormat(('一、选择题', '二、填空题', '三、解答题'))
    >>> li
    一、选择题
    >>> li.next()
    >>> li
    二、填空题
    """
    # pattern of the numeric part -> (text to number, number to text)
    formats = {'[零一二三四五六七八九十]+': (chinese2digits, digits2chinese),
               r'\d+': (int, str),
               '[A-Z]': (lambda x: ord(x) - ord('A') + 1, lambda x: chr(ord('A') + x - 1)),
               '[a-z]': (lambda x: ord(x) - ord('a') + 1, lambda x: chr(ord('a') + x - 1)),
               '[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]': (circlednumber2digits, digits2circlednumber),
               '[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]': (roman2digits, digits2roman)}

    def __init__(self, s='1'):
        """
        :param s: format of the label
            str: a numeric part plus decoration
                numeric part, six styles are supported: 一 1 A a ① Ⅰ
                    the start value need not be 1, e.g. '三' or 'D'
                decoration, common shapes are: '({})' '{}、' '{}.' '{}. '
            list or tuple: labels used in order; once they are used up no label
                is produced any more
        :raises ValueError: ``s`` is a str without a recognised numeric part, or
            is neither str nor list/tuple

        TODO only small values are considered; some styles may misbehave or
            raise for large values
        """
        if isinstance(s, str):
            for k, funcs in ListingFormat.formats.items():
                m = re.search(k, s)
                if m:
                    self.form = re.sub(k, '{}', s)
                    self.value = int(funcs[0](m.group()))
                    self.func = funcs[1]
                    break
            else:
                raise ValueError('列表初始化格式不对 s=' + str(s))
        elif isinstance(s, (list, tuple)):
            self.form = s
            self.value = 0  # index into the sequence of labels
            self.func = None
        else:
            raise ValueError('列表初始化格式不对 s=' + str(s))

    def reset(self, start=1):
        """Set the current value to ``start``.

        NOTE(review): for a list/tuple format the value is used directly as the
        index, so ``reset()`` selects the second label, not the first — confirm
        whether callers rely on this.
        """
        self.value = start

    def next(self):
        """Advance to the next label."""
        self.value += 1

    def __repr__(self):
        """Return the current label; '' once a list/tuple of labels is used up."""
        if self.func:
            return self.form.format(self.func(self.value))
        try:
            return self.form[self.value]
        except IndexError:
            # all labels consumed: no prefix any more, instead of failing inside repr()
            return ''
1301
-
1302
-
1303
def latexstrip(s):
    """LaTeX flavour of ``strip``: also removes ``~`` at both ends.

    :param s: text to trim
    :return: ``s`` without leading/trailing tabs, line feeds, spaces and ``~``
    """
    blank_chars = '\t\n ~'
    return s.strip(blank_chars)