pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +9 -2
- pyxllib/algo/__init__.py +8 -0
- pyxllib/algo/disjoint.py +54 -0
- pyxllib/algo/geo.py +541 -0
- pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
- pyxllib/algo/matcher.py +389 -0
- pyxllib/algo/newbie.py +166 -0
- pyxllib/algo/pupil.py +629 -0
- pyxllib/algo/shapelylib.py +67 -0
- pyxllib/algo/specialist.py +241 -0
- pyxllib/algo/stat.py +494 -0
- pyxllib/algo/treelib.py +149 -0
- pyxllib/algo/unitlib.py +66 -0
- pyxllib/autogui/__init__.py +5 -0
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/autogui/autogui.py +852 -0
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/virtualkey.py +102 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/__init__.py +1 -11
- pyxllib/cv/expert.py +267 -0
- pyxllib/cv/{imlib.py → imfile.py} +18 -83
- pyxllib/cv/imhash.py +39 -0
- pyxllib/cv/pupil.py +9 -0
- pyxllib/cv/rgbfmt.py +1525 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/cv/trackbartools.py +163 -49
- pyxllib/cv/xlcvlib.py +1040 -0
- pyxllib/cv/xlpillib.py +423 -0
- pyxllib/data/__init__.py +0 -0
- pyxllib/data/echarts.py +240 -0
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/{util/oss2_.py → data/oss.py} +11 -9
- pyxllib/data/pglib.py +1127 -0
- pyxllib/data/sqlite.py +568 -0
- pyxllib/{util → data}/sqllib.py +13 -31
- pyxllib/ext/JLineViewer.py +505 -0
- pyxllib/ext/__init__.py +6 -0
- pyxllib/{util → ext}/demolib.py +119 -35
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +12 -0
- pyxllib/{util/main.py → ext/old.py} +122 -284
- pyxllib/ext/qt.py +449 -0
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/seleniumlib.py +76 -0
- pyxllib/{util/tklib.py → ext/tk.py} +10 -11
- pyxllib/ext/unixlib.py +827 -0
- pyxllib/ext/utools.py +351 -0
- pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
- pyxllib/ext/win32lib.py +40 -0
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1105 -0
- pyxllib/file/__init__.py +17 -0
- pyxllib/file/docxlib.py +761 -0
- pyxllib/{util → file}/gitlib.py +40 -27
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +148 -0
- pyxllib/file/newbie.py +10 -0
- pyxllib/file/onenotelib.py +1469 -0
- pyxllib/file/packlib/__init__.py +330 -0
- pyxllib/{util → file/packlib}/zipfile.py +598 -195
- pyxllib/file/pdflib.py +426 -0
- pyxllib/file/pupil.py +185 -0
- pyxllib/file/specialist/__init__.py +685 -0
- pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
- pyxllib/file/specialist/download.py +193 -0
- pyxllib/file/specialist/filelib.py +2829 -0
- pyxllib/file/xlsxlib.py +3131 -0
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/__init__.py +5 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/deprecatedlib.py +233 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/ipyexec.py +253 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +451 -0
- pyxllib/prog/pupil.py +1197 -0
- pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
- pyxllib/prog/specialist/__init__.py +391 -0
- pyxllib/prog/specialist/bc.py +203 -0
- pyxllib/prog/specialist/browser.py +497 -0
- pyxllib/prog/specialist/common.py +347 -0
- pyxllib/prog/specialist/datetime.py +199 -0
- pyxllib/prog/specialist/tictoc.py +240 -0
- pyxllib/prog/specialist/xllog.py +180 -0
- pyxllib/prog/xlosenv.py +108 -0
- pyxllib/stdlib/__init__.py +17 -0
- pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
- pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
- pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
- pyxllib/text/__init__.py +8 -0
- pyxllib/text/ahocorasick.py +39 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +121 -0
- pyxllib/text/jiebalib.py +267 -0
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +922 -0
- pyxllib/text/latex/__init__.py +158 -0
- pyxllib/text/levenshtein.py +303 -0
- pyxllib/text/nestenv.py +1215 -0
- pyxllib/text/newbie.py +300 -0
- pyxllib/text/pupil/__init__.py +8 -0
- pyxllib/text/pupil/common.py +1121 -0
- pyxllib/text/pupil/xlalign.py +326 -0
- pyxllib/text/pycode.py +47 -0
- pyxllib/text/specialist/__init__.py +8 -0
- pyxllib/text/specialist/common.py +112 -0
- pyxllib/text/specialist/ptag.py +186 -0
- pyxllib/text/spellchecker.py +172 -0
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/vbacode.py +17 -0
- pyxllib/text/xmllib.py +747 -0
- pyxllib/xl.py +39 -0
- pyxllib/xlcv.py +17 -0
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
- pyxllib/basic/_1_strlib.py +0 -945
- pyxllib/basic/_2_timelib.py +0 -488
- pyxllib/basic/_3_pathlib.py +0 -916
- pyxllib/basic/_4_loglib.py +0 -419
- pyxllib/basic/__init__.py +0 -54
- pyxllib/basic/arrow_.py +0 -250
- pyxllib/basic/chardet_.py +0 -66
- pyxllib/basic/dirlib.py +0 -529
- pyxllib/basic/dprint.py +0 -202
- pyxllib/basic/extension.py +0 -12
- pyxllib/basic/judge.py +0 -31
- pyxllib/basic/log.py +0 -204
- pyxllib/basic/pathlib_.py +0 -705
- pyxllib/basic/pytictoc.py +0 -102
- pyxllib/basic/qiniu_.py +0 -61
- pyxllib/basic/strlib.py +0 -761
- pyxllib/basic/timer.py +0 -132
- pyxllib/cv/cv.py +0 -834
- pyxllib/cv/cvlib/_1_geo.py +0 -543
- pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
- pyxllib/cv/cvlib/_2_imgproc.py +0 -594
- pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
- pyxllib/cv/cvlib/_4_cvimg.py +0 -211
- pyxllib/cv/cvlib/__init__.py +0 -10
- pyxllib/cv/debugtools.py +0 -82
- pyxllib/cv/fitz_.py +0 -300
- pyxllib/cv/installer.py +0 -42
- pyxllib/debug/_0_installer.py +0 -38
- pyxllib/debug/_1_typelib.py +0 -277
- pyxllib/debug/_2_chrome.py +0 -198
- pyxllib/debug/_3_showdir.py +0 -161
- pyxllib/debug/_4_bcompare.py +0 -140
- pyxllib/debug/__init__.py +0 -49
- pyxllib/debug/bcompare.py +0 -132
- pyxllib/debug/chrome.py +0 -198
- pyxllib/debug/installer.py +0 -38
- pyxllib/debug/showdir.py +0 -158
- pyxllib/debug/typelib.py +0 -278
- pyxllib/image/__init__.py +0 -12
- pyxllib/torch/__init__.py +0 -20
- pyxllib/torch/modellib.py +0 -37
- pyxllib/torch/trainlib.py +0 -344
- pyxllib/util/__init__.py +0 -20
- pyxllib/util/aip_.py +0 -141
- pyxllib/util/casiadb.py +0 -59
- pyxllib/util/excellib.py +0 -495
- pyxllib/util/filelib.py +0 -612
- pyxllib/util/jsondata.py +0 -27
- pyxllib/util/jsondata2.py +0 -92
- pyxllib/util/labelmelib.py +0 -139
- pyxllib/util/onepy/__init__.py +0 -29
- pyxllib/util/onepy/onepy.py +0 -574
- pyxllib/util/onepy/onmanager.py +0 -170
- pyxllib/util/pyautogui_.py +0 -219
- pyxllib/util/textlib.py +0 -1305
- pyxllib/util/unorder.py +0 -22
- pyxllib/util/xmllib.py +0 -639
- pyxllib-0.0.43.dist-info/METADATA +0 -39
- pyxllib-0.0.43.dist-info/RECORD +0 -80
- pyxllib-0.0.43.dist-info/top_level.txt +0 -1
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/util/textlib.py
DELETED
@@ -1,1305 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Data : 2018/12/27
|
6
|
-
|
7
|
-
|
8
|
-
"""
|
9
|
-
文本处理、常用正则匹配模式
|
10
|
-
|
11
|
-
下面大量的函数前缀含义:
|
12
|
-
grp,generate regular pattern,生成正则模式字符串
|
13
|
-
grr,generate regular replace,生成正则替换目标格式
|
14
|
-
"""
|
15
|
-
|
16
|
-
from pyxllib.util.mathlib import *
|
17
|
-
|
18
|
-
import base64
|
19
|
-
import bisect
|
20
|
-
import itertools
|
21
|
-
|
22
|
-
____section_0_import = """
|
23
|
-
try ... except不影响效率的
|
24
|
-
主要是导入特殊包,好像是比较耗费时间,这里要占用掉0.1秒多时间
|
25
|
-
"""
|
26
|
-
|
27
|
-
# 这个需要C++14编译器 https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
|
28
|
-
# 在需要的时候安装,防止只是想用pyxllib很简单的功能,但是在pip install阶段处理过于麻烦
|
29
|
-
try:
|
30
|
-
# MatchSimString计算编辑距离需要
|
31
|
-
import Levenshtein
|
32
|
-
except ModuleNotFoundError:
|
33
|
-
subprocess.run(['pip', 'install', 'python-Levenshtein'])
|
34
|
-
import Levenshtein
|
35
|
-
|
36
|
-
# import textract # ensure_content读取word文档需要
|
37
|
-
|
38
|
-
try: # 拼写检查库,即词汇库
|
39
|
-
from spellchecker import SpellChecker
|
40
|
-
except ModuleNotFoundError:
|
41
|
-
subprocess.run(['pip', 'install', 'pyspellchecker'])
|
42
|
-
from spellchecker import SpellChecker
|
43
|
-
|
44
|
-
____section_1_text = """
|
45
|
-
一些文本处理函数和类
|
46
|
-
"""
|
47
|
-
|
48
|
-
|
49
|
-
class ContentLine(object):
    """Analyse a piece of text through its line structure.

    Attributes set by ``__init__``:
        content: the text under analysis, as returned by ``ensure_content``.
        linepos: ``linepos[i - 1]`` is the index inside ``content`` of the
            newline that terminates line ``i``; the last entry is
            ``len(content)``.
        lines: ``content.splitlines()``.
    """

    def __init__(self, content):
        """Initialise from ``content`` (normalised through ``ensure_content``)."""
        self.content = ensure_content(content)  # original text
        # Index of every newline, plus a sentinel for the end of the text.
        self.linepos = [i for i, ch in enumerate(self.content) if ch == '\n']
        self.linepos.append(len(self.content))
        self.lines = self.content.splitlines()  # text of every line

    def line_start_pos(self, line):
        """Start position of line ``line``.

        Not implemented: the method is a placeholder and always returns None.
        """
        pass

    def lines_num(self):
        """Return the number of newline characters in ``content``.

        NOTE(review): for text that does not end with a newline this is one
        less than ``len(self.lines)`` -- confirm which value callers expect.
        """
        return self.content.count('\n')

    def match_lines(self, pattern):
        """Return the 0-based indices of the lines accepted by ``pattern``.

        :param pattern: a regular expression (tested with ``re.search``) or a
            callable taking the line text and returning a truthy/falsy value
        """
        if callable(pattern):
            accept = pattern
        else:
            def accept(s):
                return re.search(pattern, s)
        return [i for i, line in enumerate(self.lines) if accept(line)]

    def in_line(self, ob):
        """Map ``ob`` to 1-based line number(s).

        * object with a ``span()`` method (e.g. a regex match): line of its
          start offset
        * ``int``: line that contains that character offset
        * ``str``: list of the lines whose text equals ``ob``
        * any other iterable: list with the result for each element

        :raises ValueError: for any other type
        """
        # collections.Iterable was removed in Python 3.10; the ABC lives in
        # collections.abc.  list and tuple are Iterable, so one check suffices.
        from collections.abc import Iterable

        if hasattr(ob, 'span'):
            return self.in_line(ob.span()[0])
        elif isinstance(ob, int):
            # Count the line terminators located strictly before the offset.
            return bisect.bisect_right(self.linepos, ob - 1) + 1
        elif isinstance(ob, str):
            return [i for i, line in enumerate(self.lines, start=1) if line == ob]
        elif isinstance(ob, Iterable):
            return list(map(self.in_line, ob))
        else:
            dprint(typename(ob))  # unsupported type
            raise ValueError

    def regular_search(self, re_str):
        """Like ``in_line``: line number of every match of ``re_str`` in ``content``."""
        return self.in_line(re.finditer(re_str, self.content))

    def lines_content(self, lines) -> str:
        """Return the text of the given lines, each prefixed with its number.

        ``lines`` holds 1-based line numbers; duplicates are dropped and the
        output is sorted by line number.
        """
        wanted = sorted(set(lines))  # de-duplicate
        return '\n'.join('{:6} {}'.format(n, self.lines[n - 1]) for n in wanted)

    def __str__(self):
        return self.content
|
124
|
-
|
125
|
-
|
126
|
-
def binary_cut_str(s, fmt='0'):
    """Repeatedly cut a string into a fraction of itself.

    :param s: the full string to cut
    :param fmt: a string of digits applied from left to right
        0: keep the left half
        1: keep the right 1/2
        2: keep the right 1/3
        ...
        9: keep the right 1/10
    :return: the remaining piece (piece sizes are rounded down)

    >>> binary_cut_str('1234', '0')
    '12'
    >>> binary_cut_str('1234', '1')
    '34'
    >>> binary_cut_str('1234', '10')
    '3'
    >>> binary_cut_str('123456789', '20')
    '7'
    >>> binary_cut_str('123456789', '210')  # '21' leaves '9', then '0' keeps 0 chars
    ''
    """
    for digit in map(int, fmt):
        size = len(s) // (1 + max(1, digit))
        # Slice from an explicit start so that size == 0 yields '' on the right.
        s = s[:size] if digit == 0 else s[len(s) - size:]
    return s
|
157
|
-
|
158
|
-
|
159
|
-
def digits2chinese(n):
    """Convert an integer in [0, 99] to Chinese numerals.

    TODO: the supported range is limited and still needs to be extended.

    :raises NotImplementedError: if ``n`` is negative or greater than 99

    >>> digits2chinese(0)
    '零'
    >>> digits2chinese(10)
    '十'
    >>> digits2chinese(15)
    '十五'
    >>> digits2chinese(20)
    '二十'
    >>> digits2chinese(21)
    '二十一'
    """
    numerals = '零一二三四五六七八九'
    if n < 0 or n >= 100:
        raise NotImplementedError
    if n < 10:
        return numerals[n]
    tens, units = divmod(n, 10)
    # 10..19 are written without a leading '一' ('十五', not '一十五').
    head = '十' if tens == 1 else numerals[tens] + '十'
    return head + (numerals[units] if units else '')
|
173
|
-
|
174
|
-
|
175
|
-
def chinese2digits(chinese_str):
    """Replace every run of Chinese numerals in ``chinese_str`` with Arabic digits.

    Based on https://blog.csdn.net/leon_wzm/article/details/78963082

    >>> chinese2digits('第十三章')
    '第13章'
    >>> chinese2digits('一百零三')
    '103'
    """
    numerals = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
                '十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}

    def inner(m):
        text = m.group()
        if text is None or text.strip() == '':
            raise ValueError(f'input error for {chinese_str}')
        text = text.strip().replace('百十', '百一十')

        total = 0
        unit = 1  # value of the unit word currently in effect (to the right)
        # Read the characters one by one from right to left.
        for pos in reversed(range(len(text))):
            value = numerals.get(text[pos])
            if value is None:
                # Unknown character: give up and hand back the whole input.
                return chinese_str
            if value < 10:
                # Plain digit: it counts `unit` times.
                total += unit * value
            elif value > unit:
                # A larger unit word takes over, e.g. '一万一百零三'.
                unit = value
                if pos == 0:
                    # Leading unit word without a digit, e.g. '十三'.
                    total += value
            else:
                # Stacked unit words, e.g. '一千亿'.
                unit *= value
        return str(total)

    return re.sub(r'[零一二两三四五六七八九十百千万亿]+', inner, chinese_str)
|
212
|
-
|
213
|
-
|
214
|
-
def digits2roman(d):
    """Convert a number in [0, 12] to a single Roman-numeral character.

    0 maps to the placeholder '~' that occupies slot 0 of the table.

    :raises NotImplementedError: if ``d`` is outside [0, 12]

    >>> digits2roman(2)
    'Ⅱ'
    >>> digits2roman(12)
    'Ⅻ'
    """
    rmn = '~ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'  # rmn: roman number

    d = int(d)  # make sure we index with an integer
    # The lower bound matters: a negative index would wrap around the table.
    if 0 <= d <= 12:
        return rmn[d]
    raise NotImplementedError
|
228
|
-
|
229
|
-
|
230
|
-
def roman2digits(d):
    """Convert a single Roman-numeral character (Ⅰ..Ⅻ) to its integer value.

    The placeholder '~' maps to 0.

    :raises NotImplementedError: if ``d`` is not exactly one character of the
        table

    >>> roman2digits('Ⅱ')
    2
    >>> roman2digits('Ⅻ')
    12
    """
    rmn = '~ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'
    # `d in rmn` alone is a substring test and would also accept '' or 'ⅠⅡ'.
    if len(d) == 1 and d in rmn:
        return rmn.index(d)
    # NotImplemented is a constant, not an exception; raising it is a TypeError.
    raise NotImplementedError
|
242
|
-
|
243
|
-
|
244
|
-
def digits2circlednumber(d):
    """Convert a number in [1, 20] to its circled-number character (①..⑳).

    :raises NotImplementedError: if ``d`` is outside [1, 20]

    >>> digits2circlednumber(3)
    '③'
    """
    d = int(d)
    if 0 < d <= 20:
        return '①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳'[d - 1]
    # NotImplemented is a constant, not an exception; raising it is a TypeError.
    raise NotImplementedError
|
250
|
-
|
251
|
-
|
252
|
-
def circlednumber2digits(d):
    """Convert a circled-number character (①..⑳) to its integer value.

    :raises NotImplementedError: if ``d`` is not exactly one character of the
        table

    >>> circlednumber2digits('③')
    3
    """
    t = '①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳'
    # `d in t` alone is a substring test and would also accept '' or '①②'.
    if len(d) == 1 and d in t:
        return t.index(d) + 1
    # NotImplemented is a constant, not an exception; raising it is a TypeError.
    raise NotImplementedError
|
258
|
-
|
259
|
-
|
260
|
-
def gettag_name(tagstr):
    """Return the name of the first opening or closing tag in ``tagstr``.

    Returns None when no tag is found.

    >>> gettag_name('%<topic type=danxuan description=单选题>')
    'topic'
    >>> gettag_name('</topic>')
    'topic'
    """
    found = re.search(r'</?([a-zA-Z_]+)', tagstr)
    return found.group(1) if found else None
|
272
|
-
|
273
|
-
|
274
|
-
def settag_name(tagstr, *, new_name=None, switch=None):
    """Rename a tag and/or turn it into an opening or closing tag.

    :param new_name: if truthy, every tag name in ``tagstr`` is replaced by it
    :param switch: None leaves the tag kind alone; a truthy value turns
        ``</`` into ``<``; a falsy value returns a bare closing tag, which
        drops every attribute (a leading ``%`` is kept when ``%<`` is present)

    >>> settag_name('%<topic type=danxuan description=单选题>', new_name='mdzz')
    '%<mdzz type=danxuan description=单选题>'
    >>> settag_name('<topic type=danxuan description=单选题>', switch=False)
    '</topic>'
    """
    if new_name:
        tagstr = re.sub(r'(</?)([a-zA-Z_]+)', lambda m: m.group(1) + new_name, tagstr)

    if switch is None:
        return tagstr
    if switch:
        # Make it an opening tag.
        return tagstr.replace('</', '<')

    # Make it a closing tag named after the first tag found in the string.
    found = re.search(r'</?([a-zA-Z_]+)', tagstr)
    name = found.group(1) if found else None
    closing = f'</{name}>'
    return '%' + closing if '%<' in tagstr else closing
|
294
|
-
|
295
|
-
|
296
|
-
def gettag_attr(tagstr, attrname):
    r"""Return the value of attribute ``attrname`` of the first tag in ``tagstr``.

    ``tagstr`` is parsed with BeautifulSoup/lxml; the first ``bs4.Tag`` among
    the children of the parsed ``<p>`` element is inspected.  Returns None when
    the attribute is missing or when no such element can be located.

    >>> gettag_attr('%<topic type=danxuan description=单选题> 123\n<a b=c></a>', 'type')
    'danxuan'
    >>> gettag_attr('%<topic type="dan xu an" description=单选题>', 'type')
    'dan xu an'
    >>> gettag_attr("%<topic type='dan xu an' description=单选题>", 'type')
    'dan xu an'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'type')
    'dan'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description123') is None
    True
    """
    import bs4
    soup = BeautifulSoup(tagstr, 'lxml')
    try:
        first_tag = next((node for node in soup.p.contents if isinstance(node, bs4.Tag)), None)
        if first_tag is not None:
            return first_tag.get(attrname, None)
    except AttributeError:
        # soup.p is None when the parser produced no <p> wrapper.
        dprint(tagstr)
    return None
|
324
|
-
|
325
|
-
|
326
|
-
def settag_attr(tagstr, attrname, target_value):
    r"""Set, add or delete attribute ``attrname`` of the tag in ``tagstr``.

    All whitespace is removed from ``target_value`` first.  A non-empty value
    replaces the current value, or is appended before the first ``>`` when the
    attribute is absent.  An empty value deletes the attribute.  The input is
    returned unchanged when nothing can be done: closing tags, deleting a
    missing attribute, or adding to a string that has no ``>``.

    The value is inserted literally, so backslashes and group references in it
    are safe, and ``attrname`` is matched literally rather than as a regex.

    >>> settag_attr('%<topic type=danxuan> 123\n<a></a>', 'type', 'tiankong')
    '%<topic type="tiankong"> 123\n<a></a>'
    >>> settag_attr('%<topic>', 'type', 'tiankong')
    '%<topic type="tiankong">'
    >>> settag_attr('</topic>', 'type', 'tiankong')
    '</topic>'
    >>> settag_attr('<seq value="1">', 'value', '练习1.2')
    '<seq value="练习1.2">'
    >>> settag_attr('<seq type=123 value=1>', 'type', '')  # delete the attribute
    '<seq value=1>'
    >>> settag_attr('<seq type=123 value=1>', 'value', '')  # delete the attribute
    '<seq type=123>'
    >>> settag_attr('<seq type=123 value=1>', 'haha', '')  # nothing to delete
    '<seq type=123 value=1>'
    """
    # Closing tags carry no attributes and are never modified.
    if tagstr.startswith('</'):
        return tagstr

    # Normalise the new value: drop every whitespace character.
    target_value = re.sub(r'\s', '', target_value)
    # Groups: 1 = delimiter before the name, 2 = 'name=', 3 = current value,
    # 4 = what follows the value (start of the next attribute, or the '>').
    attr_re = re.compile(r'(<|\s)(' + re.escape(attrname) + r'=)(.+?)(\s+\w+=|\s*>)')
    found = attr_re.search(tagstr)

    if target_value:
        if found:
            # A callable replacement keeps target_value out of the template
            # syntax ('\1', '\g<..>' and friends).
            return attr_re.sub(
                lambda m: m.group(1) + m.group(2) + '"' + target_value + '"' + m.group(4),
                tagstr)
        n = tagstr.find('>')
        if n < 0:
            # No tag end to insert in front of: leave the input untouched.
            return tagstr
        return tagstr[:n] + ' ' + attrname + '="' + target_value + '"' + tagstr[n:]

    if found:
        # Delete: keep only what followed the value.
        return attr_re.sub(lambda m: m.group(4), tagstr)
    return tagstr
|
366
|
-
|
367
|
-
|
368
|
-
def briefstr(s):
    """Reduce ``s`` to a simplified key for approximate string comparison.

    All whitespace is removed and the result is case-folded.
    """
    return re.sub(r'\s+', '', s).casefold()
|
378
|
-
|
379
|
-
|
380
|
-
def brieftexstr(s):
    """Reduce TeX text to a simplified key for comparison.

    Removes ``%<...>`` annotations (brackets nested up to the depth passed to
    ``grp_bracket``), then all whitespace, and case-folds the result.
    """
    without_notes = re.sub(r'%' + grp_bracket(2, '<', '>'), r'', s)
    return re.sub(r'\s+', '', without_notes).casefold()
|
391
|
-
|
392
|
-
|
393
|
-
class MatchSimString:
    """Find the candidate string most similar to a query.

    Usage::

        mss = MatchSimString()
        mss.append_candidate('first candidate')
        mss.append_candidate('second candidate', 'any extra value')

        idx, sim = mss.match('secnd candidate')
        print(mss[idx])            # original text of the best candidate
        print(sim)                 # Levenshtein.ratio of the best candidate
        print(mss.ext_value[idx])  # extra value stored with that candidate
    """

    def __init__(self, method=briefstr):
        self.preproc = method  # applied to every candidate if callable
        self.origin_str = []   # candidates exactly as supplied
        self.key_str = []      # candidates after preprocessing
        self.ext_value = []    # optional extra value per candidate

    def __getitem__(self, item):
        return self.origin_str[item]

    def __len__(self):
        return len(self.key_str)

    def append_candidate(self, k, v=None):
        """Register candidate ``k`` together with an optional extra value ``v``."""
        self.origin_str.append(k)
        self.key_str.append(self.preproc(k) if callable(self.preproc) else k)
        self.ext_value.append(v)

    def match(self, s):
        """Return ``(index, ratio)`` of the candidate most similar to ``s``.

        ``s`` is compared as given (it is not preprocessed).  Returns
        ``(-1, 0)`` when no candidate has a ratio above 0; ties keep the
        earliest candidate.
        """
        best_idx, best_sim = -1, 0
        for pos in range(len(self)):
            score = Levenshtein.ratio(self.key_str[pos], s)
            if score > best_sim:
                best_idx, best_sim = pos, score
        return best_idx, best_sim

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """Print a table of candidates ranked by similarity to ``s``.

        :param s: the string to match
        :param count: how many rows to print
            -1: all rows
            0 < count < 1: that fraction of the candidates (at least one row)
            positive int: the best ``count`` rows
        :param showstr: formats the candidate key shown in the table
        """
        # 1 score every candidate; rows are numbered from 0
        total = len(self)
        rows = []
        for pos in range(total):
            key, extra = self.key_str[pos], self.ext_value[pos]
            rows.append([pos, extra, Levenshtein.ratio(key, s), showstr(key)])

        # 2 best first, then truncate
        rows.sort(key=lambda row: -row[2])
        if 0 < count < 1:
            total = max(1, int(total * count))
        elif isinstance(count, int) and count > 0:
            total = min(count, total)
        rows = rows[:total]

        # 3 output
        df = pd.DataFrame.from_records(rows, columns=('序号', '标签', '编辑距离', '内容'))
        text = dataframe_str(df)
        text = text.replace('\u2022', '')  # TeXstudio fails on this character
        print(text)
|
485
|
-
|
486
|
-
|
487
|
-
def endswith(s, tags):
    """Like ``str.endswith``, but ``tags`` may also be a list of suffixes.

    :raises TypeError: if ``tags`` is not a str, list or tuple

    >>> endswith('a.dvi', ('.log', '.aux', '.dvi', 'busy'))
    True
    """
    if isinstance(tags, str):
        return s.endswith(tags)
    if not isinstance(tags, (list, tuple)):
        raise TypeError
    return any(s.endswith(t) for t in tags)
|
502
|
-
|
503
|
-
|
504
|
-
def mydictstr(d, key_value_delimit='=', item_delimit=' '):
    """Render a dict as one line of ``key=value`` items.

    Newlines inside values are shown as the two characters ``\\n``.
    """
    return item_delimit.join(
        str(key) + key_value_delimit + str(value).replace('\n', r'\n')
        for key, value in d.items()
    )
|
511
|
-
|
512
|
-
|
513
|
-
def findnth(haystack, needle, n):
    """Return the index of the n-th (0-based) occurrence of ``needle``, or -1.

    A negative ``n`` counts from the last occurrence (-1 is the last one).
    Occurrences are the non-overlapping ones found by ``str.split``.

    https://stackoverflow.com/questions/1883980/find-the-nth-occurrence-of-substring-in-a-string
    """
    if n < 0:
        n += haystack.count(needle)
        if n < 0:
            return -1

    pieces = haystack.split(needle, n + 1)
    if len(pieces) <= n + 1:
        return -1
    # Everything except the last piece and the needle in front of it lies
    # before the requested occurrence.
    tail = pieces[-1]
    return len(haystack) - len(tail) - len(needle)
|
524
|
-
|
525
|
-
|
526
|
-
def refine_digits_set(digits):
    """Render a collection of integers compactly, collapsing consecutive runs.

    Duplicates are dropped and the numbers are sorted.  A run of three or more
    consecutive numbers is written as ``first-last``; shorter runs are listed
    individually.  An empty input gives ''.

    >>> refine_digits_set([210, 207, 207, 208, 211, 212])
    '207,208,210-212'
    >>> refine_digits_set([1, 2, 3, 5])
    '1-3,5'
    """
    arr = sorted(set(digits))  # de-duplicate
    n = len(arr)
    parts = []
    i = 0
    while i < n:
        # Advance j to the last element of the consecutive run starting at i.
        j = i
        while j + 1 < n and arr[j + 1] - arr[j] == 1:
            j += 1
        if j - i >= 2:
            parts.append(f'{arr[i]}-{arr[j]}')
            i = j + 1
        else:
            # Runs of one or two are listed element by element.
            parts.append(str(arr[i]))
            i += 1
    return ','.join(parts)
|
548
|
-
|
549
|
-
|
550
|
-
def printoneline(s):
    """Print ``s`` shortened so that it fits on one terminal line."""
    try:
        # Terminal width, with a small margin.
        width = os.get_terminal_size().columns - 3
    except OSError:
        # Not attached to a terminal (e.g. run inside an IDE): use a large width.
        width = 500
    print(shorten(s, width))
|
559
|
-
|
560
|
-
|
561
|
-
def del_tail_newline(s):
    """Strip one trailing newline from ``s``.

    A string that consists of a single newline only is returned unchanged.
    """
    has_tail_newline = len(s) > 1 and s[-1] == '\n'
    return s[:-1] if has_tail_newline else s
|
566
|
-
|
567
|
-
|
568
|
-
____section_2_regular = """
|
569
|
-
跟正则相关的一些文本处理函数和类
|
570
|
-
"""
|
571
|
-
|
572
|
-
|
573
|
-
def grp_bracket(depth=0, left='{', right=None):
    r"""Build a regex that matches a balanced bracket pair.

    Works like ``{.*?}`` except that the opening and closing brackets are
    guaranteed to pair up.  Each extra level of ``depth`` wraps the pattern
    once more, so deeper nesting is supported at some cost in speed, e.g.
    ``grp_bracket(5)``.

    :param depth: number of additional nesting levels to support
    :param left: opening bracket; ``(``, ``[`` and ``{`` have a default partner
    :param right: closing bracket; derived from ``left`` when None
    :return: the pattern string
    :raises NotImplementedError: if ``right`` is None and ``left`` has no
        default partner

    Some regex background:
    >>> re.sub(r'[^\[\]]', r'', r'a[b]a[]')  # drop everything but square brackets
    '[][]'
    >>> re.sub(r'[^\(\)]', r'', r'a(b)a()')  # drop everything but parentheses
    '()()'
    >>> re.sub(r'[^()]', r'', r'a(b)a()')  # also works without the backslashes
    '()()'

    What this function gives:
    >>> re.sub(grp_bracket(5), r'', r'x{aaa{b{d}b}ccc{d{{}e}ff}gg}y')
    'xy'
    >>> re.sub(grp_bracket(5, '(', ')'), r'', r'x(aaa(b(d)b)ccc(d(()e)ff)gg)y')
    'xy'
    >>> re.sub(grp_bracket(5, '[', ']'), r'', r'x[aaa[b[d]b]ccc[d[[]e]ff]gg]y')
    'xy'
    """
    opener, closer = left, right
    if closer is None:
        for known_open, known_close in (('(', ')'), ('[', ']'), ('{', '}')):
            if opener == known_open:
                closer = known_close
                break
        else:
            raise NotImplementedError

    # Round and square brackets are regex metacharacters and need escaping.
    if opener in '([':
        opener = '\\' + opener
    if closer in ')]':
        closer = '\\' + closer

    non_bracket = f'[^{opener}{closer}]'
    # Depth 0: a pair with no brackets inside.
    pattern = f'{opener}{non_bracket}*{closer}'
    # Each level allows the previous pattern to appear among the plain characters.
    prefix = f'{opener}(?:{non_bracket}|'
    suffix = f')*{closer}'
    while depth:
        pattern = prefix + pattern + suffix
        depth -= 1
    return pattern
|
633
|
-
|
634
|
-
|
635
|
-
# Frequently used patterns that match a bracket pair and capture only the
# text between the brackets, not the brackets themselves.
#
# grp_bracket escapes square brackets, so the outer '\[' and '\]' are two
# characters each and are stripped with [2:-2]; braces are not escaped and are
# stripped with [1:-1].
SQUARE3 = r'\[(' + grp_bracket(3, '[')[2:-2] + r')\]'
BRACE1 = '{(' + grp_bracket(1)[1:-1] + ')}'
BRACE2 = '{(' + grp_bracket(2)[1:-1] + ')}'
BRACE3 = '{(' + grp_bracket(3)[1:-1] + ')}'
BRACE4 = '{(' + grp_bracket(4)[1:-1] + ')}'
BRACE5 = '{(' + grp_bracket(5)[1:-1] + ')}'
# Usage example:
#   >> m = re.search(r'\\multicolumn' + BRACE3*3, r'\multicolumn{2}{|c|}{$2^{12}$个数}')
#   >> m.groups()
#   ('2', '|c|', '$2^{12}$个数')
|
647
|
-
|
648
|
-
|
649
|
-
def grp_figure(cnt_groups=0, parpic=False):
    r"""Build a regex that matches a LaTeX figure-insertion command.

    Recognised commands: ``\includegraphics``, ``\figt``, ``\figc``, ``\figr``
    and ``\fig``.  The captured picture name does not include its braces.

    :param cnt_groups: number of capture groups in the pattern
        0: no groups
        1: the picture argument
        2: the command name and the picture argument
        3: the command name, the text between them, and the picture argument
        anything else: the function returns None
    :param parpic: if true, wrap the pattern so that it matches the figure
        inside a ``\parpic`` command
    """
    # 'fig' must stay last in every alternation so that longer names win.
    builders = (
        lambda: r'\\(?:includegraphics|figt|figc|figr|fig).*?' + grp_bracket(3),
        lambda: r'\\(?:includegraphics|figt|figc|figr|fig).*?' + BRACE3,
        lambda: r'\\(includegraphics|figt|figc|figr|fig).*?' + BRACE3,
        lambda: r'\\(includegraphics|figt|figc|figr|fig)(.*?)' + BRACE3,
    )

    pattern = None
    for groups, build in enumerate(builders):
        if cnt_groups == groups:
            pattern = build()
            break

    if pattern and parpic:
        pattern = r'{?\\parpic(?:\[.\])?{' + pattern + r'}*'

    return pattern
|
672
|
-
|
673
|
-
|
674
|
-
def grp_topic(*, type_value=None):
    """Return the regex that locates a ``%<topic ...> ... %</topic>`` span.

    Callers must compile or apply the pattern with ``flags=re.DOTALL``.

    :param type_value: intended to select a topic type; currently ignored
        (TODO: not implemented yet)
    """
    return r'%<topic.*?%</topic>'
|
681
|
-
|
682
|
-
|
683
|
-
def grp_chinese_char():
    """Return a character class for CJK ideographs plus some full-width
    punctuation and the circled digits ① to ⑨."""
    pattern = r'[\u4e00-\u9fa5,。;?()【】、①-⑨]'
    return pattern
|
685
|
-
|
686
|
-
|
687
|
-
def grr_check(m):
    """Replacement callback for inspecting matches: returns the match unchanged.

    No inspection is implemented yet, so using it as the ``repl`` of
    ``re.sub`` leaves the text as it was.
    """
    return m.group()
|
692
|
-
|
693
|
-
|
694
|
-
def regularcheck(pattern, string, flags=0):
    """Return a text table listing every match of ``pattern`` in ``string``.

    Each row holds the running match number, the line number of the match
    start and every capture group shortened to 200 characters.

    NOTE(review): each row has two leading columns but the header has only one
    ('行号') before the group columns -- confirm what ``len_in_dim2`` returns
    and whether the header is complete.
    """
    locator = ContentLine(string)
    rows = [
        [number, locator.in_line(m.start(0)), *(textwrap.shorten(g, 200) for g in m.groups())]
        for number, m in enumerate(re.finditer(pattern, string, flags), start=1)
    ]
    group_heads = [f'第{x}组' for x in range(len_in_dim2(rows) - 2)]
    df = pd.DataFrame.from_records(rows, columns=['行号'] + group_heads)
    return f'正则模式:{pattern},匹配结果:\n' + dataframe_str(df)
|
704
|
-
|
705
|
-
|
706
|
-
class StrIdxBack:
|
707
|
-
r"""字符串删除部分干扰字符后,对新字符串匹配并回溯找原字符串的下标
|
708
|
-
|
709
|
-
>>> ob = StrIdxBack('bxx ax xbxax')
|
710
|
-
>>> ob.delchars(r'[ x]+')
|
711
|
-
>>> ob # 删除空格、删除字符x
|
712
|
-
baba
|
713
|
-
>>> print(ob.idx) # keystr中与原字符串对应位置:(0, 5, 9, 11)
|
714
|
-
(0, 5, 9, 11)
|
715
|
-
>>> m = re.match(r'b(ab)', ob.keystr)
|
716
|
-
>>> m = ob.matchback(m)
|
717
|
-
>>> m.group(1)
|
718
|
-
'ax xb'
|
719
|
-
>>> ob.search('ab') # 找出原字符串中内容:'ax xb'
|
720
|
-
'ax xb'
|
721
|
-
"""
|
722
|
-
|
723
|
-
def __init__(self, s):
|
724
|
-
self.oristr = s
|
725
|
-
self.idx = tuple(range(len(s))) # 存储还保留着内容的下标
|
726
|
-
self.keystr = s
|
727
|
-
|
728
|
-
def delchars(self, pattern, flags=0):
|
729
|
-
"""模仿正则的替换语法
|
730
|
-
但是不用输入替换目标s,以及目标格式,因为都是删除操作
|
731
|
-
|
732
|
-
利用正则可以知道被删除的是哪个区间范围
|
733
|
-
>>> ob = StrIdxBack('abc123df4a'); ob.delchars(r'\d+'); str(ob)
|
734
|
-
'abcdfa'
|
735
|
-
>>> ob.idx
|
736
|
-
(0, 1, 2, 6, 7, 9)
|
737
|
-
"""
|
738
|
-
k = 0
|
739
|
-
idxs = []
|
740
|
-
|
741
|
-
def repl(m):
|
742
|
-
nonlocal k, idxs
|
743
|
-
idxs.append(self.idx[k:m.start(0)])
|
744
|
-
k = m.end(0)
|
745
|
-
return ''
|
746
|
-
|
747
|
-
self.keystr = re.sub(pattern, repl, self.keystr, flags=flags)
|
748
|
-
idxs.append(self.idx[k:])
|
749
|
-
self.idx = tuple(itertools.chain(*idxs))
|
750
|
-
|
751
|
-
def compare_newstr(self, limit=300):
|
752
|
-
r"""比较直观的比较字符串前后变化
|
753
|
-
|
754
|
-
newstr相对于oldnew作展开,比较直观的显示字符串前后变化差异
|
755
|
-
>>> ob = StrIdxBack('abab'); ob.delchars('b'); ob.compare_newstr()
|
756
|
-
'a a '
|
757
|
-
"""
|
758
|
-
s1 = self.oristr
|
759
|
-
dd = set(self.idx)
|
760
|
-
|
761
|
-
s2 = []
|
762
|
-
k = 0
|
763
|
-
for i in range(min(len(s1), limit)):
|
764
|
-
if i in dd:
|
765
|
-
s2.append(s1[i])
|
766
|
-
k += 1
|
767
|
-
else:
|
768
|
-
if ord(s1[i]) < 128:
|
769
|
-
if s1[i] == ' ': # 原来是空格的,删除后要用_表示
|
770
|
-
s2.append('_')
|
771
|
-
else: # 原始不是空格的,可以用空格表示已被删除
|
772
|
-
s2.append(' ')
|
773
|
-
else: # 中文字符要用两个空格表示才能对齐
|
774
|
-
s2.append(' ')
|
775
|
-
s2 = ''.join(s2)
|
776
|
-
s2 = s2.replace('\n', r'\n')
|
777
|
-
|
778
|
-
return s2
|
779
|
-
|
780
|
-
def compare(self, limit=300):
|
781
|
-
"""比较直观的比较字符串前后变化"""
|
782
|
-
s1 = self.oristr
|
783
|
-
|
784
|
-
s1 = s1.replace('\n', r'\n')[:limit]
|
785
|
-
s2 = self.compare_newstr(limit)
|
786
|
-
|
787
|
-
return s1 + '\n' + s2 + '\n'
|
788
|
-
|
789
|
-
def matchback(self, m):
|
790
|
-
"""输入一个keystr匹配的match对象,将其映射回oristr的match对象"""
|
791
|
-
regs = []
|
792
|
-
for rs in getattr(m, 'regs'):
|
793
|
-
regs.append((self.idx[rs[0]], self.idx[rs[1] - 1] + 1)) # 注意右边界的处理有细节
|
794
|
-
return ReMatch(regs, self.oristr, m.pos, len(self.oristr), m.lastindex, m.lastgroup, m.re)
|
795
|
-
|
796
|
-
def search(self, pattern):
    """Search ``self.keystr`` for ``pattern`` and return the matching original text.

    The match is located on the processed string, mapped back with
    ``self.matchback`` and its ``group()`` is returned.

    :param pattern: regular expression searched with ``re.search``
    :return: the matched text of the mapped match, or ``''`` when nothing matches
    """
    found = re.search(pattern, self.keystr)
    if not found:
        return ''
    return self.matchback(found).group()
|
804
|
-
|
805
|
-
def __repr__(self):
    """Return the current (processed) string, ``self.keystr``."""
    return self.keystr
|
808
|
-
|
809
|
-
|
810
|
-
def bracket_match(s, idx):
    """Find the position of the bracket that pairs with the one at ``s[idx]``.

    Works for ``{}``, ``[]``, ``()`` and ``<>``. An opening bracket is matched
    by scanning forward, a closing bracket by scanning backward; nesting of the
    same bracket kind is tracked with a depth counter.

    >>> bracket_match('{123}', 0)
    4
    >>> bracket_match('0{23{5}}89', 1)
    7
    >>> bracket_match('0{23{5}}89', 7)
    1
    >>> bracket_match('0{23{5}78', 1) is None
    True
    >>> bracket_match('0{23{5}78', 20) is None
    True
    >>> bracket_match('0[2[4]{7}]01', 9)
    1
    >>> bracket_match('0{[34{6}89}', -4)
    5
    >>> bracket_match('}a{', 0) is None
    True

    :param s: text to search
    :param idx: index of the bracket to match; a negative value counts from the end
    :return: index of the partner bracket, or None when ``idx`` is out of range,
        ``s[idx]`` is not a bracket, or no partner exists
    """
    key = '{[(<>)]}'  # partner of key[k] is key[len(key) - 1 - k]
    size = len(s)
    if idx < 0:
        idx += size
    if not 0 <= idx < size:
        return None

    ch1 = s[idx]
    idx1 = key.find(ch1)
    if idx1 < 0:  # not a bracket
        return None
    ch2 = key[len(key) - 1 - idx1]
    step = 1 if idx1 < len(key) // 2 else -1  # openers occupy the first half of key

    depth = 1
    i = idx + step
    # No wrap-around: running off either end means there is no partner.
    while 0 <= i < size:
        if s[i] == ch1:
            depth += 1
        elif s[i] == ch2:
            depth -= 1
            if depth == 0:
                return i
        i += step
    return None
|
854
|
-
|
855
|
-
|
856
|
-
def bracket_match2(s, idx):
    r"""Like ``bracket_match``, but brackets preceded by a backslash are ignored.

    While scanning, any character whose predecessor is ``\`` is skipped, so an
    escaped bracket such as ``\{`` or ``\}`` neither opens nor closes a level.

    >>> bracket_match2('a{b{}b}c', 1)
    6
    >>> bracket_match2('a{b{\}b}c}d', 1)
    9
    >>> bracket_match2('}a{', 0) is None
    True

    :param s: text to search
    :param idx: index of the bracket to match; a negative value counts from the end
    :return: index of the partner bracket, or None when ``idx`` is out of range,
        ``s[idx]`` is not a bracket, or no partner exists
    """
    key = '{[(<>)]}'  # partner of key[k] is key[len(key) - 1 - k]
    size = len(s)
    if idx < 0:
        idx += size
    if not 0 <= idx < size:
        return None

    ch1 = s[idx]
    idx1 = key.find(ch1)
    if idx1 < 0:  # not a bracket
        return None
    ch2 = key[len(key) - 1 - idx1]
    step = 1 if idx1 < len(key) // 2 else -1  # openers occupy the first half of key

    depth = 1
    i = idx + step
    # No wrap-around: running off either end means there is no partner.
    while 0 <= i < size:
        if i and s[i - 1] == '\\':
            pass  # escaped character, never counts as a bracket
        elif s[i] == ch1:
            depth += 1
        elif s[i] == ch2:
            depth -= 1
            if depth == 0:
                return i
        i += step
    return None
|
892
|
-
|
893
|
-
|
894
|
-
____section_3_ensure_content = """
|
895
|
-
从任意类型文件读取文本数据的功能
|
896
|
-
"""
|
897
|
-
|
898
|
-
|
899
|
-
def readtext(filename, encoding=None):
    """Read a plain text file and return its content as a string.

    The file is read as bytes and decoded with ``encoding``; when no encoding
    is given it is determined by ``get_encoding`` from the raw bytes.
    Undecodable bytes are dropped, and Windows line endings are normalised to
    a single newline.

    :param filename: path of the file to read
    :param encoding: explicit encoding; detected from the content when falsy
    :return: the decoded text, or None when the file does not exist
    """
    try:
        with open(filename, 'rb') as fp:  # binary mode: no newline translation happens here
            raw = fp.read()
    except FileNotFoundError:
        return None

    text = raw.decode(encoding=encoding or get_encoding(raw), errors='ignore')
    # Independent of the codec used: CRLF line endings cause surprises downstream.
    return text.replace('\r\n', '\n') if '\r' in text else text
|
915
|
-
|
916
|
-
|
917
|
-
def ensure_content(ob=None, encoding=None):
    """Return text content for several kinds of input.

    :param ob: what to read from
        * None: read everything from standard input
        * object with a callable ``read``: return ``ob.read()``
        * path of an existing file: return the file content
            - ``.docx`` is converted with the optional ``textract`` package
            - ``.doc`` and ``.pdf`` are not supported yet (NotImplementedError)
            - anything else is read with ``readtext``
        * anything else (ordinary text, a directory name, a non-path object):
          returned unchanged
    :param encoding: encoding forced on plain text files; detected when omitted
    :return: the text content, or ``ob`` itself when it is not a readable source
    :raises ModuleNotFoundError: a ``.docx`` file was given but textract is not installed
    :raises NotImplementedError: a ``.doc`` or ``.pdf`` file was given
    """
    if ob is None:
        return sys.stdin.read()  # on a terminal, input is finished with Ctrl + D

    reader = getattr(ob, 'read', None)
    if callable(reader):
        return reader()

    try:
        is_file = Path(ob).is_file()
    except (OSError, ValueError, TypeError):
        # Arbitrary text can be too long for a file name or contain characters
        # a path cannot hold, and non-path objects cannot be turned into a Path
        # at all; none of these is an existing file.
        is_file = False
    if not is_file:
        return ob

    name = str(ob)  # ob may be a Path object, which has no endswith()
    if name.endswith('.docx'):
        try:
            import textract
        except ModuleNotFoundError:
            dprint()  # textract is missing; install notes: https://blog.csdn.net/code4101/article/details/79328636
            raise  # keep the original message naming the missing module
        return textract.process(name).decode('utf8', errors='ignore')
    if name.endswith(('.doc', '.pdf')):
        raise NotImplementedError
    return readtext(ob, encoding)
|
950
|
-
|
951
|
-
|
952
|
-
def file_lastlines(fn, n):
    """Return the last ``n`` lines of a file as one string.

    The content is obtained through ``ensure_content`` (which returns text,
    not a file object) and split into lines with their line endings kept.

    >> file_lastlines('book.log', 1)
    'Output written on book.dvi (2 pages, 7812 bytes).'

    :param fn: file name (anything ``ensure_content`` accepts)
    :param n: number of trailing lines wanted, must be >= 0
    :return: the last ``n`` lines joined together; ``''`` when ``n`` is 0
    """
    text = ensure_content(fn)
    assert n >= 0
    if n == 0:
        # lines[-0:] would be the whole list, so handle "no lines" explicitly
        return ''
    return ''.join(text.splitlines(keepends=True)[-n:])
|
973
|
-
|
974
|
-
|
975
|
-
____section_4_spell_check = """
|
976
|
-
拼写检查
|
977
|
-
190923周一21:54,源自 完形填空ocr 识别项目
|
978
|
-
"""
|
979
|
-
|
980
|
-
|
981
|
-
class MySpellChecker(SpellChecker):
    """SpellChecker that also records observed "written word -> intended word" counts.

    ``self.checkdict`` maps a written word to a Counter of intended words.
    It is seeded with every dictionary word mapping to itself and can be
    extended from a table of recorded corrections (``update_by_dataframe``).
    """

    def __init__(self, language="en", local_dictionary=None, distance=2, tokenizer=None, case_sensitive=False,
                 df=None):
        """
        :param df: optional table of recorded corrections, see ``update_by_dataframe``;
            the remaining parameters are forwarded to ``SpellChecker``
        """
        from collections import defaultdict, Counter

        # 1 regular SpellChecker initialisation
        super().__init__(language=language, local_dictionary=local_dictionary,
                         distance=distance, tokenizer=tokenizer,
                         case_sensitive=case_sensitive)

        # 2 extra lookup used for the analysis: written word -> Counter of intended words
        self.checkdict = defaultdict(Counter)
        for k, v in self.word_frequency._dictionary.items():
            self.checkdict[k][k] = v

        # 3 apply recorded corrections when a table is supplied
        #   ("is not None": the truth value of a DataFrame is ambiguous and raises)
        if df is not None:
            self.update_by_dataframe(df)

    def update_by_dataframe(self, df, weight_times=1):
        """Merge a table of recorded corrections into the word frequencies.

        :param df: DataFrame with the columns ``old`` (written word), ``new``
            (intended word) and ``count``; it is not modified
        :param weight_times: factor applied to every ``count``
        """

        def as_key(value):
            # Values may arrive as bytes or str; when the checker is case
            # insensitive its dictionary is lower-case, so the keys must be too.
            if isinstance(value, bytes):
                value = value.decode()
            return value if self._case_sensitive else value.lower()

        d = self.word_frequency._dictionary
        for _, row in df.iterrows():
            old, new = as_key(row['old']), as_key(row['new'])
            count = row['count'] * weight_times
            # A confirmed word gains weight, a word recorded as wrong loses it.
            d[old] += count if old == new else -count
            self.checkdict[old][new] += count

        # words whose weight dropped to zero or below are no longer valid words
        self.word_frequency.remove_words([k for k in d if d[k] <= 0])

    def _ensure_term(self, term):
        """Make sure ``self.checkdict`` has an entry for ``term``, computing candidates once."""
        from collections import Counter

        if term not in self.checkdict:
            freq = self.word_frequency._dictionary
            # Fall back to the term itself when the candidate set is None or empty.
            candidates = self.candidates(term) or {term}
            # A Counter (not a plain dict) so later "+=" updates cannot raise KeyError.
            self.checkdict[term] = Counter({k: freq[k] for k in candidates})

    def correction(self, term):
        """Return the most heavily weighted correction for ``term``.

        :return: ``term`` itself when it is a known word; otherwise the best
            candidate, prefixed with ``^`` when that candidate has no positive weight
        """
        # 1 already a correct word
        w = term if self._case_sensitive else term.lower()
        if w in self.word_frequency._dictionary:
            return term

        # 2 unknown word without a recorded entry: compute its candidates once
        self._ensure_term(w)

        # 3 pick the candidate with the largest weight
        options = self.checkdict[w]
        res = max(options, key=options.get)
        if options[res] <= 0:
            res = '^' + res  # wrong word without a usable suggestion: mark it with ^
        return res

    def correction_detail(self, term):
        """Return every candidate for ``term`` with its weight, heaviest first.

        >> a.correction_detail('d')
        [('d', 9131), ('do', 1), ('old', 1)]
        """
        w = term if self._case_sensitive else term.lower()
        self._ensure_term(w)
        return sorted(self.checkdict[w].items(), key=lambda x: x[1], reverse=True)
|
1053
|
-
|
1054
|
-
|
1055
|
-
def demo_myspellchecker():
    """Demonstrate MySpellChecker.correction_detail on a few sample words."""
    # Creating the checker takes roughly 0.4 seconds
    a = MySpellChecker()

    # Loading the updates through SQL takes roughly 1 second
    # hsql = HistudySQL('ckz', 'tr_develop')
    # df = hsql.query('SELECT * FROM spell_check')
    # a.update_by_dataframe(df)

    # dprint(a.correction_detail('d'))
    # dprint(a.correction_detail('wrod'))  # 'wrod' has many possibilities, but 'word' has the largest weight
    # dprint(a.correction_detail('ckzckzckzckzckzckz'))  # 'wrod' has many possibilities, but 'word' has the largest weight
    # dprint(a.correction('ckzckzckzckzckzckz'))  # 'wrod' has many possibilities, but 'word' has the largest weight
    dprint(a.correction_detail('ike'))
    dprint(a.correction_detail('dean'))
    dprint(a.correction_detail('stud'))
    dprint(a.correction_detail('U'))
|
1072
|
-
|
1073
|
-
|
1074
|
-
____section_temp = """
|
1075
|
-
临时添加的新功能
|
1076
|
-
"""
|
1077
|
-
|
1078
|
-
|
1079
|
-
def count_word(s, *patterns):
    """Count how often the given patterns occur in a piece of text.

    :param s: the text (converted with ``str``)
    :param patterns: regular expressions, handled one after another in
        priority order; text consumed by a pattern is replaced with ``\\x00``
        so that later patterns cannot match it again
    :return:
        without patterns: ``Counter.most_common()`` over the single characters,
        i.e. ``[(char, count), ...]``
        with patterns: ``[(count, repr_of_match_without_quotes), ...]`` ordered
        by descending count
    """
    text = str(s)

    if not patterns:  # no pattern given: count every single character
        return collections.Counter(text).most_common()

    hits = []
    for pat in patterns:
        hits.extend(re.findall(pat, text))
        text = re.sub(pat, '\x00', text)

    ranked = collections.Counter(hits).most_common()
    # repr(...)[1:-1] drops the surrounding quotes and shows control characters escaped
    return [(cnt, repr(item)[1:-1]) for item, cnt in ranked]
|
1103
|
-
|
1104
|
-
|
1105
|
-
class Base85Coder:
    """Base85 encoder/decoder with an optional character-substitution key.

    Encoding: the text is UTF-8 encoded, base85 encoded, and the resulting
    characters are then substituted through the key.
    Decoding runs in the opposite order: undo the substitution first, then
    decode the base85 data.

    Example:
        key = 'xV~>Y|@muL<UK$*agCQp=t4c0R_y`Z2;q%s?o8S9(3D5W^-NA&}6v){Twj7MzGePJEfik1bBhn!d#I+HlXFOr'
        coder = Base85Coder(key)
        b = coder.encode('陈坤泽 abc')
        dprint(b)  # b<str>=d@7;B}ww?}zfGP#;1
        s = coder.decode(b)
        dprint(s)  # s<str>=陈坤泽 abc
    """
    DEFAULT_KEY = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~'
    CHARS_SET = set(DEFAULT_KEY)

    def __init__(self, key=None):
        """
        :param key: optional substitution key; it must be a permutation of
            ``DEFAULT_KEY`` (85 characters, the same character set). An invalid
            key is reported through ``dprint`` and then ignored.
        """
        # 1 validate the key
        if key and (len(key) != 85 or set(key) != Base85Coder.CHARS_SET):
            dprint(key)  # the supplied key is invalid
            key = None
        self.key = key

        # 2 build the translation tables
        if key:
            self.encode_trantab = str.maketrans(Base85Coder.DEFAULT_KEY, key)
            self.decode_trantab = str.maketrans(key, Base85Coder.DEFAULT_KEY)
        else:
            self.encode_trantab = self.decode_trantab = None

    def encode(self, s):
        """Encode the string ``s`` and return the encoded text as a ``str``."""
        # base85 output is pure ASCII, so decoding it is exact
        b = base64.b85encode(s.encode('utf8')).decode('ascii')
        if self.encode_trantab:
            b = b.translate(self.encode_trantab)
        return b

    def decode(self, b):
        """Decode text produced by ``encode`` (with the same key) back to the original string."""
        if self.decode_trantab:
            b = b.translate(self.decode_trantab)
        return base64.b85decode(b.encode('ascii')).decode('utf8')
|
1152
|
-
|
1153
|
-
|
1154
|
-
def demo_spellchecker():
    """Demonstrate how to use the spellchecker library.

    Official documentation, pyspellchecker · PyPI: https://pypi.org/project/pyspellchecker/
    2019-09-09 (Monday) 15:58, from 陈坤泽
    """
    # 0 Install and import the library
    #   The spellchecker module mainly has two classes, SpellChecker and WordFrequency
    #   WordFrequency is a word-frequency class
    #   Usually importing SpellChecker is enough: from spellchecker import SpellChecker
    try:  # spell-checking library, i.e. the vocabulary
        from spellchecker import SpellChecker
    except ModuleNotFoundError:
        subprocess.run(['pip3', 'install', 'pyspellchecker'])
        from spellchecker import SpellChecker

    # 1 Create the object
    #   Language, case sensitivity and the maximum edit distance can be configured
    #   Defaults: 'en' (English), case insensitive
    spell = SpellChecker()
    # For English, SpellChecker automatically loads the language pack site-packages\spellchecker\resources\en.json.gz: about 120,000 words including frequency weights
    d = spell.word_frequency  # d is a WordFrequency object; underneath it stores its data in a Counter
    dprint(d.unique_words, d.total_words)  # number of words, sum of the weights

    # 2 Modify the frequency table spell.word_frequency
    dprint(d['ckz'])  # a word that does not exist simply yields 0
    d.add('ckz')  # adds one occurrence of the word ckz
    d.load_words(['ckz', 'ckz', 'lyb'])  # words can be added in bulk
    dprint(d['ckz'], d['lyb'])  # d['ckz']=3 d['lyb']=1
    d.load_words(['ckz'] * 100 + ['lyb'] * 500)  # trick for adding a large weight
    dprint(d['ckz'], d['lyb'])  # d['ckz']=103 d['lyb']=501

    # Likewise, removal comes in two flavours: remove and remove_words
    d.remove('ckz')
    # d.remove_words(['ckz', 'lyb'])  # note: a key that no longer exists ('ckz') must not be removed, otherwise KeyError
    dprint(d['ckz'], d['lyb'])  # d['ckz']=0 d['lyb']=501
    # remove drops the word completely; to merely lower the weight, work on the underlying _dictionary object
    d._dictionary['lyb'] -= 100  # accessing underscore-prefixed members directly is of course not really recommended
    dprint(d['lyb'])  # ['lyb']=401

    # Words whose frequency does not exceed a threshold can also be removed
    d.remove_by_threshold(5)

    # 3 Basic features of spell
    # (1) unknown finds the words that may be misspelled, correction then gives the best suggestion
    misspelled = spell.unknown(['something', 'is', 'hapenning', 'here'])
    dprint(misspelled)  # misspelled<set>={'hapenning'}

    for word in misspelled:
        # Get the one `most likely` answer
        dprint(spell.correction(word))  # <str>='happening'
        # Get a list of `likely` options
        dprint(spell.candidates(word))  # <set>={'henning', 'happening', 'penning'}

    # Note that the default spell is case insensitive: if the vocabulary stores 'ckz' 100 times,
    #   checking 'CKZ' in any combination of upper/lower case returns the input unchanged
    #   e.g. spell.correction('ckZ') => 'ckZ'

    # (2) Changing spell.word_frequency influences the result of correction
    dprint(d['henning'], d['happening'], d['penning'])
    # d['henning']<int>=53 d['happening']<int>=4538 d['penning']<int>=23
    d._dictionary['henning'] += 10000
    dprint(spell.correction('hapenning'))  # <str>='henning'

    # (3) The weight a word takes up in the whole dictionary
    dprint(spell.word_probability('henning'))  # <float>=0.0001040741914298211
|
1219
|
-
|
1220
|
-
|
1221
|
-
def check_text_row_column(s):
    """Analyse the row/column layout of a block of text.

    Rows are separated by line breaks; columns by a tab or by at least four
    consecutive spaces (adjacent separators count as one).

    :return:
        ``(n, m)`` when every row has the same number of columns (``(0, 0)``
        for empty input);
        otherwise a list with the column count of each row, negated to signal
        the mismatch
    """
    if not s:
        return (0, 0)

    widths = []
    for raw in s.splitlines():
        # collapse every separator run into one tab, then split into cells
        cells = re.sub(r'( {4,}|\t)+', r'\t', raw.strip()).split('\t')
        widths.append(len(cells))

    if len(set(widths)) == 1:
        return len(widths), widths[0]
    return [-w for w in widths]
|
1236
|
-
|
1237
|
-
|
1238
|
-
class ListingFormat:
    r"""Formatter for list labels (numbering prefixes).

    >>> li = ListingFormat('(1)')
    >>> li
    (1)
    >>> li.next()
    >>> li
    (2)

    >>> li = ListingFormat(('一、选择题', '二、填空题', '三、解答题'))
    >>> li
    一、选择题
    >>> li.next()
    >>> li
    二、填空题
    """
    # pattern of a numbering style -> (text to number, number to text)
    formats = {'[零一二三四五六七八九十]+': (chinese2digits, digits2chinese),
               r'\d+': (int, str),
               '[A-Z]': (lambda x: ord(x) - ord('A') + 1, lambda x: chr(ord('A') + x - 1)),
               '[a-z]': (lambda x: ord(x) - ord('a') + 1, lambda x: chr(ord('a') + x - 1)),
               '[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]': (circlednumber2digits, digits2circlednumber),
               '[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]': (roman2digits, digits2roman)}

    def __init__(self, s='1'):
        """
        :param s: label format
            str: a start value plus decoration, e.g. '(1)', '一、', 'D.'.
                Six numbering styles are recognised: 一 1 A a ① Ⅰ; the start
                value does not have to be the first one ('三', 'D', ...).
                The first recognised number in ``s`` becomes the counter, the
                rest of ``s`` is kept literally as decoration.
            list or tuple: ready-made labels used in order; once they are used
                up the label is empty.
        :raises ValueError: ``s`` contains no recognised number or has an unsupported type

        TODO only small values are considered; large values may fail for some styles
        """
        if isinstance(s, str):
            for k, funcs in ListingFormat.formats.items():
                m = re.search(k, s)
                if m:
                    # Build the template around the first match only and escape
                    # literal braces, so str.format sees exactly one placeholder.
                    before, after = s[:m.start()], s[m.end():]
                    self.form = (before.replace('{', '{{').replace('}', '}}') + '{}'
                                 + after.replace('{', '{{').replace('}', '}}'))
                    self.value = int(funcs[0](m.group()))
                    self.func = funcs[1]
                    break
            else:
                raise ValueError('列表初始化格式不对 s=' + str(s))
        elif isinstance(s, (list, tuple)):
            self.form = s
            self.value = 0  # index into the label sequence
            self.func = None
        else:
            raise ValueError('列表初始化格式不对 s=' + str(s))

    def reset(self, start=1):
        """Set the counter to ``start``.

        NOTE(review): in sequence mode the counter is a 0-based index, so the
        default ``start=1`` selects the second label — confirm callers expect that.
        """
        self.value = start

    def next(self):
        """Advance to the next label."""
        self.value += 1

    def __repr__(self):
        """Return the current label; '' once a label sequence is used up."""
        if self.func:
            return self.form.format(self.func(self.value))
        try:
            return self.form[self.value]
        except IndexError:
            return ''
|
1301
|
-
|
1302
|
-
|
1303
|
-
def latexstrip(s):
    """LaTeX flavour of ``str.strip``: also removes ``~`` (non-breaking space) at both ends."""
    padding = '\t\n ~'
    return s.strip(padding)
|