pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +9 -2
- pyxllib/algo/__init__.py +8 -0
- pyxllib/algo/disjoint.py +54 -0
- pyxllib/algo/geo.py +541 -0
- pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
- pyxllib/algo/matcher.py +389 -0
- pyxllib/algo/newbie.py +166 -0
- pyxllib/algo/pupil.py +629 -0
- pyxllib/algo/shapelylib.py +67 -0
- pyxllib/algo/specialist.py +241 -0
- pyxllib/algo/stat.py +494 -0
- pyxllib/algo/treelib.py +149 -0
- pyxllib/algo/unitlib.py +66 -0
- pyxllib/autogui/__init__.py +5 -0
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/autogui/autogui.py +852 -0
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/virtualkey.py +102 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/__init__.py +1 -11
- pyxllib/cv/expert.py +267 -0
- pyxllib/cv/{imlib.py → imfile.py} +18 -83
- pyxllib/cv/imhash.py +39 -0
- pyxllib/cv/pupil.py +9 -0
- pyxllib/cv/rgbfmt.py +1525 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/cv/trackbartools.py +163 -49
- pyxllib/cv/xlcvlib.py +1040 -0
- pyxllib/cv/xlpillib.py +423 -0
- pyxllib/data/__init__.py +0 -0
- pyxllib/data/echarts.py +240 -0
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/{util/oss2_.py → data/oss.py} +11 -9
- pyxllib/data/pglib.py +1127 -0
- pyxllib/data/sqlite.py +568 -0
- pyxllib/{util → data}/sqllib.py +13 -31
- pyxllib/ext/JLineViewer.py +505 -0
- pyxllib/ext/__init__.py +6 -0
- pyxllib/{util → ext}/demolib.py +119 -35
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +12 -0
- pyxllib/{util/main.py → ext/old.py} +122 -284
- pyxllib/ext/qt.py +449 -0
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/seleniumlib.py +76 -0
- pyxllib/{util/tklib.py → ext/tk.py} +10 -11
- pyxllib/ext/unixlib.py +827 -0
- pyxllib/ext/utools.py +351 -0
- pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
- pyxllib/ext/win32lib.py +40 -0
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1105 -0
- pyxllib/file/__init__.py +17 -0
- pyxllib/file/docxlib.py +761 -0
- pyxllib/{util → file}/gitlib.py +40 -27
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +148 -0
- pyxllib/file/newbie.py +10 -0
- pyxllib/file/onenotelib.py +1469 -0
- pyxllib/file/packlib/__init__.py +330 -0
- pyxllib/{util → file/packlib}/zipfile.py +598 -195
- pyxllib/file/pdflib.py +426 -0
- pyxllib/file/pupil.py +185 -0
- pyxllib/file/specialist/__init__.py +685 -0
- pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
- pyxllib/file/specialist/download.py +193 -0
- pyxllib/file/specialist/filelib.py +2829 -0
- pyxllib/file/xlsxlib.py +3131 -0
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/__init__.py +5 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/deprecatedlib.py +233 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/ipyexec.py +253 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +451 -0
- pyxllib/prog/pupil.py +1197 -0
- pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
- pyxllib/prog/specialist/__init__.py +391 -0
- pyxllib/prog/specialist/bc.py +203 -0
- pyxllib/prog/specialist/browser.py +497 -0
- pyxllib/prog/specialist/common.py +347 -0
- pyxllib/prog/specialist/datetime.py +199 -0
- pyxllib/prog/specialist/tictoc.py +240 -0
- pyxllib/prog/specialist/xllog.py +180 -0
- pyxllib/prog/xlosenv.py +108 -0
- pyxllib/stdlib/__init__.py +17 -0
- pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
- pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
- pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
- pyxllib/text/__init__.py +8 -0
- pyxllib/text/ahocorasick.py +39 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +121 -0
- pyxllib/text/jiebalib.py +267 -0
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +922 -0
- pyxllib/text/latex/__init__.py +158 -0
- pyxllib/text/levenshtein.py +303 -0
- pyxllib/text/nestenv.py +1215 -0
- pyxllib/text/newbie.py +300 -0
- pyxllib/text/pupil/__init__.py +8 -0
- pyxllib/text/pupil/common.py +1121 -0
- pyxllib/text/pupil/xlalign.py +326 -0
- pyxllib/text/pycode.py +47 -0
- pyxllib/text/specialist/__init__.py +8 -0
- pyxllib/text/specialist/common.py +112 -0
- pyxllib/text/specialist/ptag.py +186 -0
- pyxllib/text/spellchecker.py +172 -0
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/vbacode.py +17 -0
- pyxllib/text/xmllib.py +747 -0
- pyxllib/xl.py +39 -0
- pyxllib/xlcv.py +17 -0
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
- pyxllib/basic/_1_strlib.py +0 -945
- pyxllib/basic/_2_timelib.py +0 -488
- pyxllib/basic/_3_pathlib.py +0 -916
- pyxllib/basic/_4_loglib.py +0 -419
- pyxllib/basic/__init__.py +0 -54
- pyxllib/basic/arrow_.py +0 -250
- pyxllib/basic/chardet_.py +0 -66
- pyxllib/basic/dirlib.py +0 -529
- pyxllib/basic/dprint.py +0 -202
- pyxllib/basic/extension.py +0 -12
- pyxllib/basic/judge.py +0 -31
- pyxllib/basic/log.py +0 -204
- pyxllib/basic/pathlib_.py +0 -705
- pyxllib/basic/pytictoc.py +0 -102
- pyxllib/basic/qiniu_.py +0 -61
- pyxllib/basic/strlib.py +0 -761
- pyxllib/basic/timer.py +0 -132
- pyxllib/cv/cv.py +0 -834
- pyxllib/cv/cvlib/_1_geo.py +0 -543
- pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
- pyxllib/cv/cvlib/_2_imgproc.py +0 -594
- pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
- pyxllib/cv/cvlib/_4_cvimg.py +0 -211
- pyxllib/cv/cvlib/__init__.py +0 -10
- pyxllib/cv/debugtools.py +0 -82
- pyxllib/cv/fitz_.py +0 -300
- pyxllib/cv/installer.py +0 -42
- pyxllib/debug/_0_installer.py +0 -38
- pyxllib/debug/_1_typelib.py +0 -277
- pyxllib/debug/_2_chrome.py +0 -198
- pyxllib/debug/_3_showdir.py +0 -161
- pyxllib/debug/_4_bcompare.py +0 -140
- pyxllib/debug/__init__.py +0 -49
- pyxllib/debug/bcompare.py +0 -132
- pyxllib/debug/chrome.py +0 -198
- pyxllib/debug/installer.py +0 -38
- pyxllib/debug/showdir.py +0 -158
- pyxllib/debug/typelib.py +0 -278
- pyxllib/image/__init__.py +0 -12
- pyxllib/torch/__init__.py +0 -20
- pyxllib/torch/modellib.py +0 -37
- pyxllib/torch/trainlib.py +0 -344
- pyxllib/util/__init__.py +0 -20
- pyxllib/util/aip_.py +0 -141
- pyxllib/util/casiadb.py +0 -59
- pyxllib/util/excellib.py +0 -495
- pyxllib/util/filelib.py +0 -612
- pyxllib/util/jsondata.py +0 -27
- pyxllib/util/jsondata2.py +0 -92
- pyxllib/util/labelmelib.py +0 -139
- pyxllib/util/onepy/__init__.py +0 -29
- pyxllib/util/onepy/onepy.py +0 -574
- pyxllib/util/onepy/onmanager.py +0 -170
- pyxllib/util/pyautogui_.py +0 -219
- pyxllib/util/textlib.py +0 -1305
- pyxllib/util/unorder.py +0 -22
- pyxllib/util/xmllib.py +0 -639
- pyxllib-0.0.43.dist-info/METADATA +0 -39
- pyxllib-0.0.43.dist-info/RECORD +0 -80
- pyxllib-0.0.43.dist-info/top_level.txt +0 -1
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,1121 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2021/06/03 22:56
|
6
|
+
|
7
|
+
"""
|
8
|
+
文本处理、常用正则匹配模式
|
9
|
+
|
10
|
+
下面大量的函数前缀含义:
|
11
|
+
grp,generate regular pattern,生成正则模式字符串
|
12
|
+
grr,generate regular replace,生成正则替换目标格式
|
13
|
+
"""
|
14
|
+
|
15
|
+
import base64
|
16
|
+
import bisect
|
17
|
+
import collections
|
18
|
+
import io
|
19
|
+
import json
|
20
|
+
import logging
|
21
|
+
import os
|
22
|
+
import re
|
23
|
+
import sys
|
24
|
+
|
25
|
+
from pyxllib.prog.newbie import RunOnlyOnce
|
26
|
+
from pyxllib.prog.pupil import dprint, safe_div
|
27
|
+
from pyxllib.text.newbie import circlednumber2digits, digits2circlednumber, roman2digits, digits2roman
|
28
|
+
|
29
|
+
|
30
|
+
def shorten(s, width=200, placeholder='...'):
    """ Collapse whitespace in ``s`` and cut the text to at most ``width`` characters.

    Unlike ``textwrap.shorten`` (which drops whole words and may fold a long
    unbroken token into just the placeholder), the cut here is made per character.

    :param s: any object; it is converted with ``str`` first
    :param width: upper bound of the returned length, placeholder included
    :param placeholder: marker appended when the text had to be cut
    :return: the single-spaced, possibly truncated text

    >>> shorten('aaa', 10)
    'aaa'
    >>> shorten('hell world!  0123456789  0123456789', 11)
    'hell wor...'
    >>> shorten("Hello  world!", width=12)
    'Hello world!'
    >>> shorten("Hello  world!", width=11)
    'Hello wo...'
    >>> shorten('0123456789 0123456789', 2, 'xyz')
    'xy'
    >>> shorten('0123456789 0123456789', 11)
    '01234567...'
    """
    text = re.sub(r'\s+', ' ', str(s))
    reserve = len(placeholder)
    if len(text) > width:
        keep = max(width - reserve, 0)
        text = text[:keep] + placeholder
    # the placeholder alone can exceed width, so cut once more to be safe
    return text[:width]
|
56
|
+
|
57
|
+
# return textwrap.shorten(str(s), width)
|
58
|
+
|
59
|
+
|
60
|
+
def strfind(fullstr, objstr, *, start=None, times=0, overlap=False):
    r""" Extended ``str.find``: n-th occurrence, backward search and several patterns.

    :param fullstr: the text that is searched
    :param objstr: the target; either one string, or a list/tuple of strings, in which
        case the nearest hit of any of them is used at every step
    :param start: position the search starts from. Defaults to 0 for a forward search
        and to the last index for a backward search. In a backward search the match
        must lie completely inside ``fullstr[:start + 1]``.
    :param times: which occurrence to locate. ``0`` is the first one going forward,
        ``1`` the second, ...; ``-1`` is the first one going backward, ``-2`` the second, ...
    :param overlap: whether overlapping occurrences of a single string are counted.
        With several patterns the forward cursor always advances by one character
        after a hit, whatever this flag says.
    :return: start index of the requested occurrence, or -1 if there is none

    >>> strfind('aabbaabb', 'bb')
    2
    >>> strfind('bbaaaabb', 'bb')
    0
    >>> strfind('aabbaabb', 'bb', times=1)
    6
    >>> strfind('aabbaabb', 'cc')
    -1
    >>> strfind('aabbaabb', ['aa', 'bb'], times=2)
    4
    >>> strfind('aabbaabb', 'bb', start=2)
    2
    >>> strfind('aabbaabb', 'bb', start=3)
    6
    >>> strfind('aabbaabb', ['aa', 'bb'], start=5)
    6
    >>> strfind('aabbaabb', 'aa', times=-1)
    4
    >>> strfind('aabbaabb', 'aa', start=5, times=-1)
    4
    >>> strfind('aabbaabb', 'aa', start=3, times=-1)
    0
    >>> strfind('aabbaabb', 'bb', start=7, times=-1)
    6
    >>> strfind('aaaa', 'aa', times=1)
    2
    >>> strfind('aaaa', 'aa', times=1, overlap=True)
    1
    >>> strfind('aaaa', 'aa', times=-2)
    0
    >>> strfind('aaaa', 'aa', times=-2, overlap=True)
    1
    >>> strfind(r'\item=\item+', (r'\item', r'\test'), start=1)
    6
    """
    backward = times < 0
    if start is None:
        start = len(fullstr) - 1 if backward else 0
    pos = -1

    # 1 single pattern
    if isinstance(objstr, str):
        width = len(objstr)
        if not backward:
            step = 1 if overlap else width
            cursor = start
            for _ in range(times + 1):
                pos = fullstr.find(objstr, cursor)
                if pos == -1:
                    return -1
                cursor = pos + step
        else:
            limit = start + 1  # exclusive right bound of the searched window
            for _ in range(-times):
                pos = fullstr.rfind(objstr, 0, limit)
                if pos == -1:
                    return -1
                # The next (earlier) hit has to end before this one starts, or, when
                # overlaps count, merely start before it. Clamp at 0 because rfind
                # would read a negative bound as an offset from the end.
                limit = max(pos + width - 1 if overlap else pos, 0)
        return pos

    # 2 several patterns: ask every pattern for its nearest hit and keep the closest
    if not backward:
        cursor = start
        for _ in range(times + 1):
            hits = [strfind(fullstr, x, start=cursor, overlap=overlap) for x in objstr]
            pos = min((h for h in hits if h >= 0), default=-1)
            if pos == -1:
                return -1
            cursor = pos + 1
    else:
        cursor = start
        for _ in range(-times):
            hits = [strfind(fullstr, x, start=cursor, times=-1, overlap=overlap) for x in objstr]
            pos = max((h for h in hits if h >= 0), default=-1)
            if pos == -1:
                return -1
            cursor = pos - 1
    return pos
|
171
|
+
|
172
|
+
|
173
|
+
def findspan(src, sub, start=0, end=None):
    """ Wrapper around ``str.find`` that returns the span of the match.

    :param src: the text that is searched
    :param sub: what to look for
        str, plain substring search
        re.Pattern, regular expression search inside ``src[start:end]``
        (anchors such as ``^`` therefore refer to that slice)
    :param start: start of the searched window, with slice semantics
    :param end: end of the searched window, with slice semantics; None means the end of ``src``
    :return: (start, end)
        (-1, -1) when nothing is found,
        otherwise the half-open interval [start, end) of the match in ``src``
    :raises TypeError: if ``sub`` is neither a str nor a compiled pattern
    """
    if isinstance(sub, str):
        pos = src.find(sub, start, end)
        length = len(sub)
    elif isinstance(sub, re.Pattern):
        # Turn start/end into real non-negative offsets first; adding a raw negative
        # ``start`` to the match offset would give a wrong (possibly -1) position.
        lo, hi, _ = slice(start, end).indices(len(src))
        m = sub.search(src[lo:hi])
        if m:
            pos = m.start() + lo
            length = len(m.group())
        else:
            pos, length = -1, 0
    else:
        raise TypeError(f'sub must be str or re.Pattern, got {type(sub)}')

    if pos == -1:
        return -1, -1
    return pos, pos + length
|
203
|
+
|
204
|
+
|
205
|
+
def substr_count(src, sub, overlape=False):
    """ Count the non-overlapping occurrences of ``sub`` in ``src``.

    :param src: the text that is searched
    :param sub: a plain string (counted with ``str.count``) or a compiled
        ``re.Pattern`` (counted with ``findall``)
    :param overlape: counting overlapping matches is not implemented
    :return: number of occurrences
    :raises NotImplementedError: if ``overlape`` is true
    :raises TypeError: if ``sub`` is neither a str nor a compiled pattern
    """
    if overlape:
        raise NotImplementedError
    if isinstance(sub, str):
        return src.count(sub)
    if isinstance(sub, re.Pattern):
        return len(sub.findall(src))
    raise TypeError
|
218
|
+
|
219
|
+
|
220
|
+
class Stdout:
    """ Redirect the standard output stream and capture what ``print`` writes.

    Use it with the ``with`` statement; the captured text is available as
    ``str(obj)`` or, after the block has finished, as ``obj.result``.
    """

    def __init__(self, path=None, mode='w'):
        """
        :param path: optional file name
            if given, the captured text is written to that file in ``__exit__``;
            a path that cannot be opened is logged and otherwise ignored
        :param mode: file mode used for writing
            'w': default, overwrite
            'a': append
        """
        self.origin_stdout = sys.stdout
        self._path = path
        self._mode = mode
        self.strout = io.StringIO()
        self.result = None

    def __enter__(self):
        sys.stdout = self.strout
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self.origin_stdout
        self.result = self.strout.getvalue()

        # without a target file the buffer is left open, nothing else to do
        if not self._path:
            return

        try:
            # open() takes ``encoding`` and ``errors``; the misspelled keyword names used
            # before raised TypeError on every call, so the file was never written
            with open(self._path, self._mode, encoding='utf8', errors='ignore') as f:
                f.write(self.result)
        except (TypeError, OSError) as e:
            # best effort: an unusable path must not break the caller
            logging.exception(e)
        finally:
            self.strout.close()

    def __str__(self):
        """ Text captured so far (or in total, once the block has finished) """
        # ``is not None`` matters: an empty capture is '' and the buffer may be closed
        if self.result is not None:
            return self.result
        return self.strout.getvalue()
|
269
|
+
|
270
|
+
|
271
|
+
def int2myalphaenum(n):
    """ Map a number to one character: 0 -> '_', 1..26 -> 'a'..'z', 27..52 -> 'A'..'Z'.

    :param n: a number from 0 to 52
    :return: the corresponding single character
    :raises ValueError: if ``n`` is outside 0..52 (a warning is printed first)
    """
    table = '_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not 0 <= n <= 52:
        print('警告:不在处理范围内的数值', n)
        raise ValueError
    return table[n]
|
280
|
+
|
281
|
+
|
282
|
+
def ensure_gbk(s):
    """ Return ``s`` with every character that cannot be encoded as gbk removed.

    A string that is fully encodable is returned unchanged; otherwise a warning
    with the original and the cleaned text is printed.

    :param s: the text to check
    :return: text that is guaranteed to be encodable as gbk
    """
    try:
        s.encode('gbk')
    except UnicodeEncodeError:
        cleaned = s.encode('gbk', errors='ignore').decode('gbk')
        print('警告:字符串存在无法转为gbk的字符', s, cleaned)
        return cleaned
    return s
|
292
|
+
|
293
|
+
|
294
|
+
def digit2weektag(d):
    """ Convert a weekday number 1..7 into its Chinese tag, Monday ('周一') to Sunday ('周日').

    :param d: weekday number; anything accepted by ``int`` (e.g. ``'7'``)
    :return: the two-character weekday tag
    :raises ValueError: if the number is not in 1..7

    >>> digit2weektag(1)
    '周一'
    >>> digit2weektag('7')
    '周日'
    """
    idx = int(d) - 1
    if idx not in range(7):
        raise ValueError
    return '周' + '一二三四五六日'[idx]
|
307
|
+
|
308
|
+
|
309
|
+
def fullwidth2halfwidth(ustring):
    """ Convert full-width characters to their half-width (ASCII) counterparts.

    The ideographic space U+3000 becomes a normal space and U+FF01..U+FF5E are
    shifted down to '!'..'~'; every other character is kept as is.
    Reference: https://blog.csdn.net/sparkexpert/article/details/82749207

    :param ustring: a string, or an iterable of strings whose pieces are concatenated
    :return: the converted string

    >>> fullwidth2halfwidth('\\uff50\\uff59\\u3000\\uff01')
    'py !'
    """

    def narrow(ch):
        code = ord(ch)
        if code == 0x3000:  # full-width space has its own mapping
            return ' '
        if 0xFF01 <= code <= 0xFF5E:  # the rest of the block is a constant offset away
            return chr(code - 0xFEE0)
        return ch

    return ''.join(narrow(ch) for piece in ustring for ch in piece)
|
328
|
+
|
329
|
+
|
330
|
+
def fullwidth2halfwidth2(ustring):
    """ Full-width to half-width conversion that leaves common punctuation alone.

    Works like ``fullwidth2halfwidth`` except that the full-width punctuation
    marks colon, semicolon, exclamation mark, parentheses, comma, question mark,
    double quote and full stop stay full-width.

    :param ustring: a string, or an iterable of strings whose pieces are concatenated
    :return: the converted string

    >>> fullwidth2halfwidth2('\\uff41\\uff0c\\uff42') == 'a\\uff0cb'
    True
    """
    # Written as escapes so the list survives tools that normalise full-width
    # characters to ASCII (with ASCII characters this exclusion has no effect).
    keep = '\uff1a\uff1b\uff01\uff08\uff09\uff0c\uff1f\uff02\uff0e'
    ss = []
    for s in ustring:
        for uchar in s:
            if uchar in keep:
                ss.append(uchar)
                continue
            inside_code = ord(uchar)
            if inside_code == 12288:  # full-width space
                inside_code = 32
            elif 65281 <= inside_code <= 65374:  # other full-width characters
                inside_code -= 65248
            ss.append(chr(inside_code))
    return ''.join(ss)
|
349
|
+
|
350
|
+
|
351
|
+
def halfwidth2fullwidth(ustring):
    """ Convert half-width (ASCII) characters to their full-width counterparts.

    A normal space becomes the ideographic space U+3000 and '!'..'~' are shifted
    up to U+FF01..U+FF5E; every other character is kept as is.

    :param ustring: a string, or an iterable of strings whose pieces are concatenated
    :return: the converted string

    >>> halfwidth2fullwidth('a b') == '\\uff41\\u3000\\uff42'
    True
    """

    def widen(ch):
        code = ord(ch)
        if code == 32:  # the space has its own mapping
            return chr(12288)
        if 33 <= code <= 126:  # printable ASCII is a constant offset away
            return chr(code + 65248)
        return ch

    return ''.join(widen(ch) for piece in ustring for ch in piece)
|
367
|
+
|
368
|
+
|
369
|
+
class ContentPartSpliter:
    """ Helpers that split a text into parts """

    @classmethod
    def multi_blank_lines(cls, content, leastlines=2):
        """ Split ``content`` wherever ``leastlines`` or more newline characters follow each other.

        :param content: the text to split
        :param leastlines: minimum number of consecutive ``\\n`` that separates two parts
        :return: list of the parts, each stripped; empty parts are dropped
        """
        separator = r'\n{%s,}' % leastlines
        stripped = (chunk.strip() for chunk in re.split(separator, content))
        return [chunk for chunk in stripped if chunk]
|
382
|
+
|
383
|
+
|
384
|
+
class ContentLine(object):
    """ Analyse a text by its line structure """

    def __init__(self, content):
        """ Initialise from a text

        :param content: the original text
        """
        self.content = content
        # linepos[i-1] = v: line i ends at index v (its '\n', or len(content) for the last line)
        self.linepos = [i for i, ch in enumerate(content) if ch == '\n']
        self.linepos.append(len(content))
        self.lines = content.splitlines()  # text of every line, without line breaks

    def line_start_pos(self, line):
        """ Start position of line ``line`` (not implemented yet, returns None) """
        pass

    def lines_num(self):
        """ Return the number of ``\\n`` characters in the text """
        return self.content.count('\n')

    def match_lines(self, pattern):
        """ Return the indices of the lines that satisfy ``pattern``.

        :param pattern: a regular expression tried with ``re.search`` on every line,
            or a callable that receives the line text and returns a truthy value for a hit
        :return: list of 0-based indices into ``self.lines``
        """
        if callable(pattern):
            accept = pattern
        else:
            def accept(s):
                return re.search(pattern, s)
        return [i for i, line in enumerate(self.lines) if accept(line)]

    def in_line(self, ob):
        """ Return the 1-based line number(s) that ``ob`` refers to.

        :param ob:
            object with a ``span`` method (e.g. ``re.Match``): line of the start of the span
            int: line that contains the character at this index
            str: list of the lines whose text equals ``ob``
            any other iterable: list with the result for every element
        :raises ValueError: for an unsupported type
        """
        # ``collections.Iterable`` no longer exists since Python 3.10
        from collections.abc import Iterable

        if hasattr(ob, 'span'):
            return self.in_line(ob.span()[0])
        elif isinstance(ob, int):
            return bisect.bisect_right(self.linepos, ob - 1) + 1
        elif isinstance(ob, str):
            return [i + 1 for i, line in enumerate(self.lines) if line == ob]
        elif isinstance(ob, Iterable):
            return list(map(self.in_line, ob))
        else:
            raise ValueError(f'类型错误 {type(ob)}')

    def regular_search(self, re_str):
        """ Like ``in_line``, for every match of the regular expression ``re_str`` in the text """
        return self.in_line(re.finditer(re_str, self.content))

    def lines_content(self, lines) -> str:
        """ Return the text of the given lines, each prefixed with its line number.

        :param lines: iterable of 1-based line numbers; duplicates are removed and
            the output is sorted by line number
        """
        wanted = sorted(set(lines))
        return '\n'.join('{:6} {}'.format(n, self.lines[n - 1]) for n in wanted)

    def __str__(self):
        return self.content
|
458
|
+
|
459
|
+
|
460
|
+
def digits2chinese(n):
    """ Convert an integer from 0 to 99 into Chinese numerals.

    TODO: the supported range is small and still has to be extended.

    :param n: integer in 0..99
    :return: the Chinese numeral, e.g. 0 -> '零', 10 -> '十', 15 -> '十五', 21 -> '二十一'
    :raises NotImplementedError: for values outside 0..99

    >>> digits2chinese(21)
    '二十一'
    """
    if not 0 <= n < 100:
        raise NotImplementedError
    digits = '零一二三四五六七八九'
    tens, ones = divmod(n, 10)
    if tens == 0:
        return digits[ones]
    # 10..19 are written without a leading '一'
    head = '十' if tens == 1 else digits[tens] + '十'
    return head + (digits[ones] if ones else '')
|
474
|
+
|
475
|
+
|
476
|
+
def chinese2digits(chinese_str):
    """ Replace every run of Chinese numerals in a text with Arabic digits.

    Every maximal run of the characters 零一二两三四五六七八九十百千万亿 is parsed
    from left to right and replaced by its decimal value; all other text is kept.
    A bare unit counts as one of it ('十' -> 10, '百十' -> 110), and consecutive
    digit characters without units are read positionally ('二零二一' -> 2021).

    :param chinese_str: text that may contain Chinese numerals
    :return: the text with the numerals converted

    >>> chinese2digits('第二十三章')
    '第23章'
    >>> chinese2digits('十万')
    '100000'
    >>> chinese2digits('一千二百万')
    '12000000'
    """
    digit_values = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5,
                    '六': 6, '七': 7, '八': 8, '九': 9}
    small_units = {'十': 10, '百': 100, '千': 1000}

    def to_int(text):
        total = 0  # value of the sections already closed by 万 / 亿
        section = 0  # value collected below the next 万 / 亿
        number = 0  # digits waiting for their unit
        for ch in text:
            if ch in digit_values:
                number = number * 10 + digit_values[ch]
            elif ch in small_units:
                section += (number or 1) * small_units[ch]
                number = 0
            else:
                pending = section + number
                section = number = 0
                if ch == '万':
                    if pending:
                        total += pending * 10000
                    elif total:  # stacked units such as '一万万'
                        total *= 10000
                    else:
                        total = 10000
                else:  # '亿' scales everything read so far
                    total = ((total + pending) or 1) * 100000000
        return total + section + number

    return re.sub(r'[零一二两三四五六七八九十百千万亿]+', lambda m: str(to_int(m.group())), chinese_str)
|
514
|
+
|
515
|
+
|
516
|
+
def briefstr(s):
    """ Reduce a text to its essence for approximate string comparison.

    All whitespace is removed and the rest is case-folded.

    :param s: the text to simplify
    :return: the simplified text
    """
    return re.sub(r'\s+', '', s).casefold()
|
526
|
+
|
527
|
+
|
528
|
+
@RunOnlyOnce
def grp_bracket(depth=0, left='{', right=None, inner=False):
    r""" Build a regular expression that matches a balanced pair of brackets.

    The effect is similar to "{.*?}", but the left and right bracket are really
    paired. Nesting is supported up to ``depth`` levels inside the outer pair;
    a larger number handles deeper nesting at some cost in matching speed,
    for example "grp_bracket(5)".

    :param depth: number of nesting levels supported
    :param left: opening character: (, [ or {
    :param right: closing character; if omitted it is derived from ``left``
    :param inner: by default the returned pattern contains no capturing group;
        with inner=True the content between the outer brackets is captured
    :return: the pattern as a string

    Some regex basics first:
    >>> re.sub(r'[^\[\]]', r'', r'a[b]a[]')  # remove everything but square brackets
    '[][]'
    >>> re.sub(r'[^\(\)]', r'', r'a(b)a()')  # remove everything but parentheses
    '()()'
    >>> re.sub(r'[^()]', r'', r'a(b)a()')  # works without the backslash as well
    '()()'

    What this function does:
    >>> re.sub(grp_bracket(5), r'', r'x{aaa{b{d}b}ccc{d{{}e}ff}gg}y')
    'xy'
    >>> re.sub(grp_bracket(5, '(', ')'), r'', r'x(aaa(b(d)b)ccc(d(()e)ff)gg)y')
    'xy'
    >>> re.sub(grp_bracket(5, '[', ']'), r'', r'x[aaa[b[d]b]ccc[d[[]e]ff]gg]y')
    'xy'
    """
    closing = {'(': ')', '[': ']', '{': '}'}
    opener = left
    closer = right or closing[opener]
    # round and square brackets are special in a regex and have to be escaped
    if opener in '([':
        opener = '\\' + opener
    if closer in ')]':
        closer = '\\' + closer
    plain = f'[^{opener}{closer}]'  # any character that is not a bracket

    # innermost level: a pair without brackets inside; every further level wraps
    # the previous pattern as an alternative to a plain character
    pattern = f'{opener}{plain}*{closer}'
    prefix = f'{opener}(?:{plain}|'
    suffix = f')*{closer}'
    while depth:
        pattern = prefix + pattern + suffix
        depth -= 1

    if inner:
        # put the part between the outer brackets into a capturing group
        return f'{opener}({pattern[len(opener):len(pattern) - len(closer)]}){closer}'
    return pattern
|
590
|
+
|
591
|
+
|
592
|
+
def grp_chinese_char():
    r""" Return a regex character class that matches a single Chinese character.

    Covered are the CJK ideographs U+4E00..U+9FA5, the full-width punctuation marks
    comma, full stop, semicolon, question mark, parentheses, lenticular brackets and
    enumeration comma, and the circled numbers 1 to 9.

    The members are written as ``\uXXXX`` escapes (which ``re`` understands in str
    patterns) so that ASCII ``, ; ? ( )`` can never slip into the class.
    """
    return (r'[\u4e00-\u9fa5'
            r'\uff0c\u3002\uff1b\uff1f\uff08\uff09'
            r'\u3010\u3011\u3001'
            r'\u2460-\u2468]')
|
594
|
+
|
595
|
+
|
596
|
+
def calc_chinese_ratio(s):
    """ Compute the weighted share of Chinese characters in a text.

    A character matched by ``grp_chinese_char()`` counts with weight 2, any other
    character with weight 1; the result is the Chinese weight divided by the
    total weight, computed with ``safe_div``.

    :param s: the text to analyse

    >>> calc_chinese_ratio('abc')
    0.0
    >>> calc_chinese_ratio('abc中文')
    0.5714285714285714
    """
    other = len(re.sub(grp_chinese_char(), '', s))
    weighted = 2 * (len(s) - other)
    return safe_div(weighted, weighted + other)
|
609
|
+
|
610
|
+
|
611
|
+
def grr_check(m):
    """ Replacement callback for ``re.sub`` intended for inspecting matches.

    It currently returns the matched text unchanged, so the substitution is a no-op.

    :param m: the match object handed over by ``re.sub``
    :return: the whole matched text
    """
    return m.group()
|
616
|
+
|
617
|
+
|
618
|
+
def printoneline(s):
    """ Print ``s`` shortened with ``shorten`` so that it fits on one terminal line.

    The limit is the terminal width minus 3; if the terminal size cannot be
    queried (``OSError``), 500 is used.

    :param s: the object to print
    """
    try:
        limit = os.get_terminal_size().columns - 3
    except OSError:
        # no terminal attached, so just allow a generous width
        limit = 500
    print(shorten(s, limit))
|
627
|
+
|
628
|
+
|
629
|
+
def count_word(s, *patterns):
    """ Count how often the given patterns occur in a text.

    :param s: the text; it is converted with ``str`` first
    :param patterns: regular expressions, processed one after another in the
        given order; whatever a pattern matched is replaced by ``\\x00`` before
        the next pattern is applied
    :return: without patterns, ``Counter.most_common()`` of the single characters,
        i.e. a list of (character, count);
        with patterns, a list of (count, text) sorted by falling count, where text
        is the ``repr`` of the found item without its first and last character
    """
    text = str(s)

    if not patterns:  # default: statistics of all single characters
        return collections.Counter(list(text)).most_common()

    found = []
    for pat in patterns:
        found.extend(re.findall(pat, text))
        text = re.sub(pat, '\x00', text)

    ranking = collections.Counter(found).most_common()
    return [(cnt, repr(item)[1:-1]) for item, cnt in ranking]
|
654
|
+
|
655
|
+
|
656
|
+
class Base85Coder:
    """ base85 encoder / decoder with an optional substitution key

    Encoding turns the UTF-8 bytes of a text into base85 and then maps every
    character of the standard alphabet to the character at the same position of
    the key; decoding applies the two steps in reverse order.

    Example:
        coder = Base85Coder(key)  # key: a permutation of DEFAULT_KEY
        b = coder.encode('abc')
        s = coder.decode(b)  # 'abc' again
    """
    DEFAULT_KEY = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~'
    CHARS_SET = set(DEFAULT_KEY)

    def __init__(self, key=None):
        """
        :param key: optional key; it must consist of exactly the 85 characters of
            ``DEFAULT_KEY`` in any order. An invalid key is reported with ``dprint``
            and then ignored.
        """
        # 1 validate the key
        if key and (len(key) != 85 or set(key) != Base85Coder.CHARS_SET):
            dprint(key)  # the given key is invalid
            key = None
        self.key = key

        # 2 build the translation tables
        if key:
            self.encode_trantab = str.maketrans(Base85Coder.DEFAULT_KEY, key)
            self.decode_trantab = str.maketrans(key, Base85Coder.DEFAULT_KEY)
        else:
            self.encode_trantab = self.decode_trantab = None

    def encode(self, s):
        """ Encode the text ``s`` and return the result as a str """
        encoded = base64.b85encode(s.encode('utf8')).decode('ascii')
        if self.encode_trantab:
            encoded = encoded.translate(self.encode_trantab)
        return encoded

    def decode(self, b):
        """ Decode a str produced by ``encode`` back into the original text """
        if self.decode_trantab:
            b = b.translate(self.decode_trantab)
        return base64.b85decode(b.encode('ascii')).decode('utf8')
|
703
|
+
|
704
|
+
|
705
|
+
def check_text_row_column(s):
    """ Determine the number of rows and columns of a plain-text table.

    Rows are separated by line breaks, columns by a tab or by at least four spaces.

    :param s: the text to analyse
    :return:
        (n, m), a tuple with the number of rows and columns when every row has
        the same number of columns; (0, 0) for an empty text
        [-m1, -m2, ...], a list with the negated column count of every row
        when the rows differ
    """
    if not s:
        return (0, 0)

    widths = []
    for row in s.splitlines():
        cells = re.sub(r'( {4,}|\t)+', r'\t', row.strip()).split('\t')
        widths.append(len(cells))

    if min(widths) != max(widths):
        return [-w for w in widths]
    return len(widths), widths[0]
|
720
|
+
|
721
|
+
|
722
|
+
class ListingFormat:
    r""" Formatter for list numbering

    >>> li = ListingFormat('(1)')
    >>> li
    (1)
    >>> li.next()
    >>> li
    (2)

    >>> li = ListingFormat(('一、选择题', '二、填空题', '三、解答题'))
    >>> li
    一、选择题
    >>> li.next()
    >>> li
    二、填空题
    """
    # regex of a number style -> (text to int, int to text)
    formats = {'[零一二三四五六七八九十]+': (chinese2digits, digits2chinese),
               r'\d+': (int, str),
               '[A-Z]': (lambda x: ord(x) - ord('A') + 1, lambda x: chr(ord('A') + x - 1)),
               '[a-z]': (lambda x: ord(x) - ord('a') + 1, lambda x: chr(ord('a') + x - 1)),
               '[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]': (circlednumber2digits, digits2circlednumber),
               '[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]': (roman2digits, digits2roman)}

    def __init__(self, s='1'):
        """
        :param s: format of the list label
            str, a number plus decoration, e.g. '(1)', '一、', 'A.'
                The number may be written in any style of ``formats``, tried in that
                order, and need not start at 1 (e.g. '三' or 'D').
            list or tuple, labels that are used one after another; once they are
                used up the label is empty
        :raises ValueError: if ``s`` has an unsupported type or contains no number

        TODO only small values are considered; large values can fail in some styles
        """
        if isinstance(s, str):
            for k, funcs in ListingFormat.formats.items():
                m = re.search(k, s)
                if m:
                    self.form = re.sub(k, '{}', s)
                    self.value = int(funcs[0](m.group()))
                    self.func = funcs[1]
                    break
            else:
                raise ValueError('列表初始化格式不对 s=' + str(s))
        elif isinstance(s, (list, tuple)):
            self.form = s
            self.value = 0
            self.func = None
        else:
            raise ValueError('列表初始化格式不对 s=' + str(s))

    def reset(self, start=1):
        """ Reset the current value (for a list/tuple of labels this is the index) """
        self.value = start

    def next(self):
        """ Advance to the next label """
        self.value += 1

    @classmethod
    def join(cls, start_idx, ls: list):
        """ Prefix every element of ``ls`` with a running label in the format ``start_idx``

        :return: the labelled elements joined with line breaks
        """
        lf = cls(start_idx)
        ls2 = []
        for x in ls:
            ls2.append(str(lf) + x)
            lf.next()
        return '\n'.join(ls2)

    def __repr__(self):
        if self.func:
            return self.form.format(self.func(self.value))
        try:
            return self.form[self.value]
        except IndexError:
            # all given labels are used up: no prefix any more
            return ''
|
795
|
+
|
796
|
+
|
797
|
+
class StrDiffType:
    """ Classify how a detected string ``dt`` differs from the ground truth ``gt``.

    The code is the sum of a main type (0 equal, 4 dt is part of gt, 8 gt is part
    of dt) and the normalisation needed to get there (0 none, 1 lower-casing,
    2 removing whitespace, 3 both); 16 stands for everything else.
    """
    typename = {
        0: '完全相同',
        1: '忽略case后,相同',
        2: '忽略blank后,相同',
        3: '忽略case+blank后,相同',
        4: 'dt是gt局部信息(精度ok,召回不行)',
        5: '忽略case后,dt是gt局部信息',
        6: '忽略blank后,dt是gt局部信息',
        7: '忽略case+blank后,dt是gt局部信息',
        8: 'gt是dt局部信息(召回ok,精度不行)',
        9: '忽略case后,gt是dt局部信息',
        10: '忽略blank后,gt是dt局部信息',
        11: '忽略case+blank后,gt是dt局部信息',
        16: '其他情况 (可以根据实验情况,后续继续细分类别)'
    }

    @classmethod
    def main_difftype(cls, dt, gt):
        """ Compare without any normalisation

        :return: 0 if equal, 4 if dt is contained in gt, 8 if gt is contained in dt,
            16 otherwise (also when one of the two is empty)
        """
        if not dt or not gt:
            return 16
        if dt == gt:
            return 0
        if dt in gt:
            return 4
        if gt in dt:
            return 8
        return 16

    @classmethod
    def difftype(cls, dt, gt):
        """ Return the code of the category the difference between dt and gt belongs to.

        The normalisations are tried from the weakest to the strongest and the
        first one that yields a main type other than 16 decides.
        """
        if not dt or not gt:
            return 16

        def variants():
            # produced lazily, so a normalisation is only computed when it is needed
            yield dt, gt
            yield dt.lower(), gt.lower()
            dt_ns, gt_ns = re.sub(r'\s+', '', dt), re.sub(r'\s+', '', gt)
            yield dt_ns, gt_ns
            yield dt_ns.lower(), gt_ns.lower()

        for bonus, (a, b) in enumerate(variants()):
            code = cls.main_difftype(a, b) + bonus
            if code < 16:
                return code
        return 16
|
852
|
+
|
853
|
+
|
854
|
+
class BookContents:
    """ Table of contents of a book: ordered entries that can be numbered and printed. """

    def __init__(self):
        # Entries are kept in insertion order, each as [level, title, page].
        self.contents = []

    def add(self, level, title, page=None):
        """ Append one entry to the table of contents.

        :param level: integer nesting depth of the entry (smaller is shallower)
        :param title: heading text
        :param page: page marker; it does not have to be an integer page
            number, other values such as a ratio are stored unchanged
        """
        self.contents.append([level, title, page])

    def _resolve_start_level(self, start_level):
        """ Turn a negative (automatic) start level into a concrete level.

        Automatic mode picks the shallowest level that holds more than one
        entry. If every level holds a single entry, the shallowest level in
        use is taken. A non-negative value, or an empty table, is returned
        unchanged.
        """
        if start_level >= 0:
            return start_level
        levels = [entry[0] for entry in self.contents]
        if not levels:
            return start_level
        occurrences = collections.Counter(levels)
        lowest = min(levels)
        for candidate in range(lowest, max(levels) + 1):
            if occurrences[candidate] > 1:
                return candidate
        return lowest

    def format_numbers(self, number='normal', *, indent='', start_level=1, jump=False):
        """ Compute the numbering label of every entry.

        :param number: numbering scheme; only 'normal' (dotted counters such
            as "2.1.3") is implemented, any other value yields indentation only
        :param indent: text repeated once per level below ``start_level``
        :param start_level: first level that receives a number; shallower
            entries get an empty label. A negative value (e.g. -1) selects
            the level automatically
        :param jump: drop zero counters, so that jumping from level 2 ("3")
            straight to level 4 gives "3.1" instead of "3.0.1"
        :return: list as long as ``contents`` with one label per entry,
            possibly the empty string
        """
        start_level = self._resolve_start_level(start_level)

        labels = []
        counters = collections.defaultdict(int)
        for entry in self.contents:
            level = entry[0]
            label = indent * (level - start_level)

            # Count this entry and restart the counters of all deeper levels.
            counters[level] += 1
            for deeper in [k for k in counters if k > level]:
                counters[deeper] = 0

            if number == 'normal':
                parts = [counters[i] for i in range(start_level, level + 1)]
                if jump:
                    parts = [p for p in parts if p]
                label += '.'.join(map(str, parts))

            labels.append(label)

        return labels

    def format_str(self, indent='\t', *, number='normal', page=False, start_level=1, jump=False):
        """ Render the table of contents as text, one entry per line.

        :param indent: indentation added per level below ``start_level``
        :param number: numbering scheme, see ``format_numbers``
        :param page: whether to append the stored page marker to each line
        :param start_level: first numbered level; entries above it are shown
            as the bare title. A negative value selects it automatically
        :param jump: see ``format_numbers``
        :return: the rendered lines joined by newlines
        """
        # Resolve once so the "bare title" test below sees the same level
        # that the numbering used.
        start_level = self._resolve_start_level(start_level)
        numbers = self.format_numbers(number, indent=indent, start_level=start_level, jump=jump)

        lines = []
        for num, (level, title, page_) in zip(numbers, self.contents):
            line = title if level < start_level else ' '.join([num, title])
            if page:
                line += f',{page_}'
            lines.append(line)

        return '\n'.join(lines)
|
945
|
+
|
946
|
+
|
947
|
+
def continuous_zero(s):
    """ Locate every run of consecutive '0' characters in a string.

    :param s: a string

    Used when converting html tables to latex: the cline ranges for merged
    cells are computed from these positions.

    >>> continuous_zero('0100')  # 0-based, half-open intervals
    [(0, 1), (2, 4)]
    """
    spans = []
    for hit in re.finditer(r'0+', s):
        spans.append((hit.start(), hit.end()))
    return spans
|
958
|
+
|
959
|
+
|
960
|
+
class JsonEditConverter:
    """ Convert JSON data into a flat, hand-editable text format and back.

    Every leaf value becomes one entry of the form::

        json_path=<path>,<type tag>
        <value text, possibly spanning several lines>

    Paths use ``.key`` for dict members and ``[i]`` for list items. The type
    tag is one of ``str``, ``bool``, ``num`` or ``null``.

    Limitations visible in the code: empty dicts and lists produce no entry,
    trailing whitespace of a value is stripped when parsing, and keys that
    contain ``.``, ``[`` or ``]`` cannot be represented.
    """

    _PREFIX = "json_path="
    _KNOWN_TAGS = ("str", "bool", "num", "null")
    # A path token is either a bracketed list index or a run of key characters.
    _PATH_TOKEN = re.compile(r"\[(\d+)\]|([^.\[\]]+)")

    def __init__(self):
        self.edit_format = []  # entries collected by json_to_edit_format
        self.path_regex = re.compile(r"json_path=(.*)")

    def json_to_edit_format(self, obj, path=""):
        """ Recursively append one entry per leaf of ``obj`` to ``self.edit_format``.

        :param obj: JSON-like data (dict / list / scalar)
        :param path: path of ``obj`` inside the root object, '' for the root
        """
        if isinstance(obj, dict):
            for key, value in obj.items():
                child_path = f"{path}.{key}" if path else f"{key}"
                self.json_to_edit_format(value, child_path)
            return
        if isinstance(obj, list):
            for index, item in enumerate(obj):
                self.json_to_edit_format(item, f"{path}[{index}]")
            return

        # bool must be tested before int/float because bool is an int subclass.
        if isinstance(obj, str):
            type_tag = ",str"
        elif isinstance(obj, bool):
            type_tag = ",bool"
        elif isinstance(obj, (int, float)):
            type_tag = ",num"
        elif obj is None:
            type_tag = ",null"
        else:
            type_tag = ""
        self.edit_format.append(f"json_path={path}{type_tag}\n{obj}")

    def main_json_to_edit_format(self, obj):
        """ Convert ``obj`` to the edit format.

        :return: all entries joined by blank lines
        """
        self.edit_format = []
        self.json_to_edit_format(obj)
        return '\n\n'.join(self.edit_format)

    def _split_header(self, line):
        """ Split a ``json_path=`` line into (path, type tag).

        The tag is taken from the right-most comma so that a key containing
        commas stays intact. A suffix that is not a known tag is treated as
        part of the path.
        """
        header = line[len(self._PREFIX):]
        path_str, sep, tag = header.rpartition(',')
        tag = tag.strip()
        if sep and tag in self._KNOWN_TAGS:
            return path_str, tag
        return header, ""

    def edit_format_to_json(self, edit_str):
        """ Parse edit-format text back into JSON-like data.

        :param edit_str: text produced by ``main_json_to_edit_format``,
            possibly edited by hand
        :return: the rebuilt object, or None when the text has no entry
        """
        obj = None
        lines = edit_str.strip().split("\n")
        i = 0
        while i < len(lines):
            if not lines[i].startswith(self._PREFIX):
                i += 1
                continue

            path_str, type_tag = self._split_header(lines[i])
            i += 1
            # Everything up to the next header belongs to this value.
            value_lines = []
            while i < len(lines) and not lines[i].startswith(self._PREFIX):
                value_lines.append(lines[i])
                i += 1
            value = "\n".join(value_lines).rstrip()

            if obj is None:
                # The first path decides whether the root is a list or a dict.
                obj = [] if path_str.strip().startswith("[") else {}
            value = self._convert_value_based_on_type(value, type_tag)
            if path_str:
                self._set_value_by_path(obj, path_str, value)
            else:
                # An empty path means the document is a single scalar.
                obj = value
        return obj

    def _convert_value_based_on_type(self, value, type_tag):
        """ Convert the text of a value according to its type tag.

        :return: bool / int / float / None for the matching tags; the
            unchanged string for ``str``, for unknown tags and for numbers
            that cannot be parsed
        """
        if type_tag == "bool":
            return value.lower() == "true"
        if type_tag == "num":
            # Try int first, then float, so exponent forms such as "1e-07"
            # (which contain no '.') are still parsed as numbers.
            for caster in (int, float):
                try:
                    return caster(value)
                except ValueError:
                    continue
            return value
        if type_tag == "null":
            return None
        return value

    def _set_value_by_path(self, obj, path_str, value):
        """ Store ``value`` in ``obj`` at ``path_str``, creating containers on the way.

        Missing intermediate containers are created as a list when the next
        path element is an index and as a dict otherwise. Lists are padded
        with None up to the requested index.
        """
        path_elements = self._parse_path(path_str)
        current = obj
        for i, element in enumerate(path_elements[:-1]):
            make_container = list if isinstance(path_elements[i + 1], int) else dict
            if isinstance(element, int):  # list step
                while len(current) <= element:
                    current.append(None)
                if current[element] is None:
                    current[element] = make_container()
            elif element not in current:  # dict step
                current[element] = make_container()
            current = current[element]

        last_element = path_elements[-1]
        if isinstance(last_element, int) and isinstance(current, list):
            while len(current) <= last_element:
                current.append(None)
        current[last_element] = value

    def _parse_path(self, path_str):
        """ Split a path string into dict keys (str) and list indices (int).

        Only bracketed digits (``[3]``) become indices. A digit-only key
        written after a dot (``a.1``) stays a string key.
        """
        return [int(index) if index else key
                for index, key in self._PATH_TOKEN.findall(path_str)]
|
1064
|
+
|
1065
|
+
|
1066
|
+
def trial_jsoneditconveter():
    """ Demo: print the edit format and the parsed-back JSON for a few samples. """

    def trial_data(json_obj):
        # Round-trip one sample through the converter and show both stages.
        converter = JsonEditConverter()
        edit_text = converter.main_json_to_edit_format(json_obj)
        print("转换为编辑格式:\n" + edit_text)
        updated_json = converter.edit_format_to_json(edit_text)
        print("解析后的JSON:\n" + json.dumps(updated_json, indent=4))

    samples = [
        # Sample 1: nested structure
        {
            "name": "Example",
            "details": {
                "description": 123,
                "tags": ["demo", "json"]
            },
            "items": [{"name": "Item 1"}, {"name": "Item 2"}]
        },
        # Sample 2: list at the root, with the various special types
        [1, 2, 3, True, '123', None, {'a': 1}],
        # Samples 3 and 4: bare scalars without any container
        "Hello World!",
        123,
    ]

    for sample in samples:
        trial_data(sample)
|
1096
|
+
|
1097
|
+
|
1098
|
+
class UrlQueryBuilder:
    """ Chainable builder for the query-string part of a URL.

    Keys and values are inserted verbatim (no percent-encoding is applied),
    so callers must pass text that is already safe to put in a URL.
    """

    def __init__(self):
        self.params = {}  # parameter name -> value, in insertion order

    def add_param(self, key, value):
        """ Set ``key`` to ``value``; a value of None is ignored.

        :return: self, to allow chaining
        """
        if value is not None:
            self.params[key] = value
        return self

    def remove_param(self, key):
        """ Drop ``key`` if it is present.

        :return: self, to allow chaining
        """
        self.params.pop(key, None)
        return self

    def update_param(self, key, value):
        """ Same as ``add_param``: overwrite ``key`` unless ``value`` is None.

        :return: self, to allow chaining
        """
        self.add_param(key, value)
        return self

    def build_query(self):
        """ Build the query string.

        :return: '?k1=v1&k2=v2...' or '' when there are no parameters
        """
        query_string = '&'.join(f'{key}={value}' for key, value in self.params.items())
        return ('?' + query_string) if query_string else ''

    def build_url(self, url):
        """ Append the query string to ``url``.

        If ``url`` already contains a '?', the parameters are appended with
        '&' instead of adding a second '?'.

        :param url: base url, with or without an existing query string
        :return: the combined url as a string
        """
        query = self.build_query()
        if not query:
            return f'{url}'
        base = f'{url}'
        if '?' not in base:
            return base + query
        # The url already has a query part: continue it rather than restart it.
        joiner = '' if base.endswith(('?', '&')) else '&'
        return base + joiner + query[1:]
|