pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +9 -2
- pyxllib/algo/__init__.py +8 -0
- pyxllib/algo/disjoint.py +54 -0
- pyxllib/algo/geo.py +541 -0
- pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
- pyxllib/algo/matcher.py +389 -0
- pyxllib/algo/newbie.py +166 -0
- pyxllib/algo/pupil.py +629 -0
- pyxllib/algo/shapelylib.py +67 -0
- pyxllib/algo/specialist.py +241 -0
- pyxllib/algo/stat.py +494 -0
- pyxllib/algo/treelib.py +149 -0
- pyxllib/algo/unitlib.py +66 -0
- pyxllib/autogui/__init__.py +5 -0
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/autogui/autogui.py +852 -0
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/virtualkey.py +102 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/__init__.py +1 -11
- pyxllib/cv/expert.py +267 -0
- pyxllib/cv/{imlib.py → imfile.py} +18 -83
- pyxllib/cv/imhash.py +39 -0
- pyxllib/cv/pupil.py +9 -0
- pyxllib/cv/rgbfmt.py +1525 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/cv/trackbartools.py +163 -49
- pyxllib/cv/xlcvlib.py +1040 -0
- pyxllib/cv/xlpillib.py +423 -0
- pyxllib/data/__init__.py +0 -0
- pyxllib/data/echarts.py +240 -0
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/{util/oss2_.py → data/oss.py} +11 -9
- pyxllib/data/pglib.py +1127 -0
- pyxllib/data/sqlite.py +568 -0
- pyxllib/{util → data}/sqllib.py +13 -31
- pyxllib/ext/JLineViewer.py +505 -0
- pyxllib/ext/__init__.py +6 -0
- pyxllib/{util → ext}/demolib.py +119 -35
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +12 -0
- pyxllib/{util/main.py → ext/old.py} +122 -284
- pyxllib/ext/qt.py +449 -0
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/seleniumlib.py +76 -0
- pyxllib/{util/tklib.py → ext/tk.py} +10 -11
- pyxllib/ext/unixlib.py +827 -0
- pyxllib/ext/utools.py +351 -0
- pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
- pyxllib/ext/win32lib.py +40 -0
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1105 -0
- pyxllib/file/__init__.py +17 -0
- pyxllib/file/docxlib.py +761 -0
- pyxllib/{util → file}/gitlib.py +40 -27
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +148 -0
- pyxllib/file/newbie.py +10 -0
- pyxllib/file/onenotelib.py +1469 -0
- pyxllib/file/packlib/__init__.py +330 -0
- pyxllib/{util → file/packlib}/zipfile.py +598 -195
- pyxllib/file/pdflib.py +426 -0
- pyxllib/file/pupil.py +185 -0
- pyxllib/file/specialist/__init__.py +685 -0
- pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
- pyxllib/file/specialist/download.py +193 -0
- pyxllib/file/specialist/filelib.py +2829 -0
- pyxllib/file/xlsxlib.py +3131 -0
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/__init__.py +5 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/deprecatedlib.py +233 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/ipyexec.py +253 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +451 -0
- pyxllib/prog/pupil.py +1197 -0
- pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
- pyxllib/prog/specialist/__init__.py +391 -0
- pyxllib/prog/specialist/bc.py +203 -0
- pyxllib/prog/specialist/browser.py +497 -0
- pyxllib/prog/specialist/common.py +347 -0
- pyxllib/prog/specialist/datetime.py +199 -0
- pyxllib/prog/specialist/tictoc.py +240 -0
- pyxllib/prog/specialist/xllog.py +180 -0
- pyxllib/prog/xlosenv.py +108 -0
- pyxllib/stdlib/__init__.py +17 -0
- pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
- pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
- pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
- pyxllib/text/__init__.py +8 -0
- pyxllib/text/ahocorasick.py +39 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +121 -0
- pyxllib/text/jiebalib.py +267 -0
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +922 -0
- pyxllib/text/latex/__init__.py +158 -0
- pyxllib/text/levenshtein.py +303 -0
- pyxllib/text/nestenv.py +1215 -0
- pyxllib/text/newbie.py +300 -0
- pyxllib/text/pupil/__init__.py +8 -0
- pyxllib/text/pupil/common.py +1121 -0
- pyxllib/text/pupil/xlalign.py +326 -0
- pyxllib/text/pycode.py +47 -0
- pyxllib/text/specialist/__init__.py +8 -0
- pyxllib/text/specialist/common.py +112 -0
- pyxllib/text/specialist/ptag.py +186 -0
- pyxllib/text/spellchecker.py +172 -0
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/vbacode.py +17 -0
- pyxllib/text/xmllib.py +747 -0
- pyxllib/xl.py +39 -0
- pyxllib/xlcv.py +17 -0
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
- pyxllib/basic/_1_strlib.py +0 -945
- pyxllib/basic/_2_timelib.py +0 -488
- pyxllib/basic/_3_pathlib.py +0 -916
- pyxllib/basic/_4_loglib.py +0 -419
- pyxllib/basic/__init__.py +0 -54
- pyxllib/basic/arrow_.py +0 -250
- pyxllib/basic/chardet_.py +0 -66
- pyxllib/basic/dirlib.py +0 -529
- pyxllib/basic/dprint.py +0 -202
- pyxllib/basic/extension.py +0 -12
- pyxllib/basic/judge.py +0 -31
- pyxllib/basic/log.py +0 -204
- pyxllib/basic/pathlib_.py +0 -705
- pyxllib/basic/pytictoc.py +0 -102
- pyxllib/basic/qiniu_.py +0 -61
- pyxllib/basic/strlib.py +0 -761
- pyxllib/basic/timer.py +0 -132
- pyxllib/cv/cv.py +0 -834
- pyxllib/cv/cvlib/_1_geo.py +0 -543
- pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
- pyxllib/cv/cvlib/_2_imgproc.py +0 -594
- pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
- pyxllib/cv/cvlib/_4_cvimg.py +0 -211
- pyxllib/cv/cvlib/__init__.py +0 -10
- pyxllib/cv/debugtools.py +0 -82
- pyxllib/cv/fitz_.py +0 -300
- pyxllib/cv/installer.py +0 -42
- pyxllib/debug/_0_installer.py +0 -38
- pyxllib/debug/_1_typelib.py +0 -277
- pyxllib/debug/_2_chrome.py +0 -198
- pyxllib/debug/_3_showdir.py +0 -161
- pyxllib/debug/_4_bcompare.py +0 -140
- pyxllib/debug/__init__.py +0 -49
- pyxllib/debug/bcompare.py +0 -132
- pyxllib/debug/chrome.py +0 -198
- pyxllib/debug/installer.py +0 -38
- pyxllib/debug/showdir.py +0 -158
- pyxllib/debug/typelib.py +0 -278
- pyxllib/image/__init__.py +0 -12
- pyxllib/torch/__init__.py +0 -20
- pyxllib/torch/modellib.py +0 -37
- pyxllib/torch/trainlib.py +0 -344
- pyxllib/util/__init__.py +0 -20
- pyxllib/util/aip_.py +0 -141
- pyxllib/util/casiadb.py +0 -59
- pyxllib/util/excellib.py +0 -495
- pyxllib/util/filelib.py +0 -612
- pyxllib/util/jsondata.py +0 -27
- pyxllib/util/jsondata2.py +0 -92
- pyxllib/util/labelmelib.py +0 -139
- pyxllib/util/onepy/__init__.py +0 -29
- pyxllib/util/onepy/onepy.py +0 -574
- pyxllib/util/onepy/onmanager.py +0 -170
- pyxllib/util/pyautogui_.py +0 -219
- pyxllib/util/textlib.py +0 -1305
- pyxllib/util/unorder.py +0 -22
- pyxllib/util/xmllib.py +0 -639
- pyxllib-0.0.43.dist-info/METADATA +0 -39
- pyxllib-0.0.43.dist-info/RECORD +0 -80
- pyxllib-0.0.43.dist-info/top_level.txt +0 -1
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/text/nestenv.py
ADDED
@@ -0,0 +1,1215 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : chenkz@histudy.com
|
5
|
+
# @Date : 2019/02/20 10:03
|
6
|
+
|
7
|
+
import bisect
|
8
|
+
import re
|
9
|
+
|
10
|
+
from pyxllib.algo.intervals import Intervals, ReMatch
|
11
|
+
from pyxllib.text.newbie import bracket_match2
|
12
|
+
from pyxllib.text.pupil import grp_bracket, strfind, findspan, substr_count
|
13
|
+
|
14
|
+
|
15
|
+
def pqmove(s, p, q):
    """Shrink the span ``s[p:q]`` inward, similar to ``str.strip`` but on offsets.

    On each side any run of spaces/tabs is skipped, followed by at most one newline.

    :param s: the text the offsets refer to
    :param p: start offset of the span
    :param q: end offset of the span; ``-1`` means "no tail match" and is treated as ``len(s)``
    :return: tuple ``(p, q)`` with the tightened offsets; the two cursors never cross,
        so an all-whitespace span collapses to an empty one
    """
    if p == q:
        return p, q
    if q == -1:  # no tail match was found: use the end of the string as q
        q = len(s)

    # The ``p < q`` guards stop the scan from running past the end of ``s``
    # (IndexError) and from producing a reversed interval.
    while p < q and s[p] in ' \t':
        p += 1
    if p < q and s[p] == '\n':
        p += 1

    while q > p and s[q - 1] in ' \t':
        q -= 1
    if q > p and s[q - 1] == '\n':
        q -= 1

    return p, q
|
31
|
+
|
32
|
+
|
33
|
+
class __NestEnvBase:
|
34
|
+
__slots__ = ('s', 'intervals')
|
35
|
+
|
36
|
+
def __init__(self, s, intervals=None):
    """Bind the text and the set of located sub-intervals.

    :param s: the full text every interval indexes into
    :param intervals: anything ``Intervals`` accepts; when omitted, one interval
        spanning the whole of ``s`` is used
    """
    self.s = s
    spans = Intervals([[0, len(s)]]) if intervals is None else intervals
    self.intervals = Intervals(spans)
|
40
|
+
|
41
|
+
@classmethod
def from_fragments(cls, fragments):
    """ Build an instance from text that is already split into pieces

    :param list fragments: strings alternating outside/inside: item 0 is text outside
        the intervals, item 1 is inside, item 2 is outside, and so on.
        If the text starts inside an interval, put an empty '' fragment first as a placeholder

    >>> __NestEnvBase.from_fragments(['a', '12', 'bc', '3', 'e']).strings()
    ['12', '3']
    >>> __NestEnvBase.from_fragments(['', '12', 'bc', '3', 'e']).strings()
    ['12', '3']
    """
    spans, offset = [], 0
    for index, piece in enumerate(fragments):
        stop = offset + len(piece)
        if index % 2:  # odd positions are the pieces inside an interval
            spans.append([offset, stop])
        offset = stop
    return cls(''.join(fragments), spans)
|
60
|
+
|
61
|
+
def inner(self, head, tail=None):
    r""" Mode 0: the text between the markers, excluding the head and tail markers themselves

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\ce{').inner(r'\ce{').replace('x')
    '01\\ce{H2O\\ce{x}}01\\ce{1\\ce{x}5}'
    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\cc{').string()
    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\cc{').string()
    '01\\ce{H2O\\ce{2}}01\\ce{1\\ce{3}5}'

    TODO locating of the topic/analysis kind is not supported by this function yet and will hit bugs
    TODO using 0 as the mode number is not ideal because it composes badly: 1 and 2 are mutually
        exclusive, but 0 and 2 are not and could be combined (content outside the range, tags
        included); 4 would fit better, but changing it now is awkward, so it is only noted here
    """
    found = []
    for reg in self.intervals:
        offset = reg.start()
        piece = self.s[offset:reg.end()]
        # results are relative to the piece, so shift them back by the piece's offset
        found.extend(substr_intervals(piece, head, tail, inner=True) + offset)
    return type(self)(self.s, Intervals(found))
|
78
|
+
|
79
|
+
def inside(self, head, tail=None):
    r""" Mode 1: the text inside the markers

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inside(r'\ce{').replace('x')
    '01x01x'
    """
    found = []
    for reg in self.intervals:
        offset = reg.start()
        piece = self.s[offset:reg.end()]
        # results are relative to the piece, so shift them back by the piece's offset
        found.extend(substr_intervals(piece, head, tail) + offset)
    return type(self)(self.s, Intervals(found))
|
90
|
+
|
91
|
+
def outside(self, head, tail=None):
    r""" Mode 2: the text outside the markers

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\ce{').replace(lambda s: 'x')
    'x\\ce{H2O\\ce{2}}x\\ce{1\\ce{3}5}'
    """
    found = []
    for reg in self.intervals:
        offset = reg.start()
        piece = self.s[offset:reg.end()]
        # results are relative to the piece, so shift them back by the piece's offset
        found.extend(substr_intervals(piece, head, tail, invert=True) + offset)
    return type(self)(self.s, Intervals(found))
|
102
|
+
|
103
|
+
def expand(self, ne, adjacent=False):
    r""" Grow the current intervals by every interval of ``ne`` that overlaps them

    :param ne: a NestEnv (its intervals are used) or an Intervals object;
        anything else raises TypeError
    :param adjacent: also extend over intervals that are merely adjacent

    >>> ne = LatexNestEnv(r'aa$cc\ce{a}dd$bb\ce{d}h$h$')
    >>> ne.latexcmd1(r'ce').expand(ne.formula()).strings()
    ['$cc\\ce{a}dd$', '\\ce{d}']
    """
    if isinstance(ne, NestEnv):
        others = ne.intervals
    elif isinstance(ne, Intervals):
        others = ne
    else:
        raise TypeError

    mine = self.intervals
    if not adjacent:
        overlapping = Intervals([x for x in others if (mine & x)])
        return type(self)(self.s, mine + overlapping)

    touching = Intervals([x for x in others if (mine.is_adjacent_and(x))])
    merged = Intervals((mine + touching).merge_intersect_interval(adjacent=True))
    return type(self)(self.s, merged)
|
125
|
+
|
126
|
+
def filter(self, func):
    r""" Keep only the intervals whose text satisfies a predicate

    :param func: called with the text of each interval; the interval is kept when the result is truthy

    >>> LatexNestEnv('aa$bbb$ccc$d$eee$fff$g').formula().filter(lambda s: len(s) > 4).strings()
    ['$bbb$', '$fff$']
    """
    kept = [x for x in self.intervals if func(self.s[x.start():x.end()])]
    return type(self)(self.s, kept)
|
134
|
+
|
135
|
+
def _parse_tags(self, tags):
|
136
|
+
if not isinstance(tags[0], (list, tuple)):
|
137
|
+
# 旧单维数组输入,要先转成二维结构
|
138
|
+
n = len(tags) // 3
|
139
|
+
assert n and n * 3 == len(tags)
|
140
|
+
tags = [tags[3 * i:3 * (i + 1)] for i in range(n)]
|
141
|
+
return tags
|
142
|
+
|
143
|
+
# def any(self, tags):
|
144
|
+
# r""" 区间集求并
|
145
|
+
#
|
146
|
+
# :param tags: 同nestenv的tags参数规则
|
147
|
+
#
|
148
|
+
# >>> NestEnv(r'12$34$56\ce{78}90').any(['$', '$', 1, r'\ce{', '}', 1]).replace(lambda s: 'x')
|
149
|
+
# '12x56x90'
|
150
|
+
# """
|
151
|
+
# tags, li = self._parse_tags(tags), []
|
152
|
+
# for tag in tags:
|
153
|
+
# head, tail, t = tag
|
154
|
+
# for reg in self.intervals:
|
155
|
+
# left, right = reg.start(), reg.end()
|
156
|
+
# li.extend(substr_intervals(self.s[left:right], head, tail, invert=(t == 2), inner=(t == 0)) + left)
|
157
|
+
# return NestEnv(self.s, Intervals(li))
|
158
|
+
|
159
|
+
# def all(self, tags):
|
160
|
+
# r""" 区间集求交
|
161
|
+
#
|
162
|
+
# :param tags: 同nestenv的tags参数规则
|
163
|
+
#
|
164
|
+
# # 删除即在公式里,也在ce里的内容
|
165
|
+
# >>> NestEnv(r'12$34$56\ce{78$x$}90').all([r'\ce{', '}', 1, '$', '$', 1]).replace(lambda s: '')
|
166
|
+
# '12$34$56\\ce{78}90'
|
167
|
+
#
|
168
|
+
# >>> NestEnv(r'12$34$56\ce{78$x$}90').all(['$', '$', 1, r'\ce{', '}', 1]).replace(lambda s: '')
|
169
|
+
# '12$34$56\\ce{78}90'
|
170
|
+
# """
|
171
|
+
# tags, intervals = self._parse_tags(tags), self.intervals
|
172
|
+
# for tag in tags:
|
173
|
+
# head, tail, t = tag
|
174
|
+
# li = []
|
175
|
+
# for reg in self.intervals:
|
176
|
+
# left, right = reg.start(), reg.end()
|
177
|
+
# li.extend(substr_intervals(self.s[left:right], head, tail, invert=(t == 2), inner=(t == 0)) + left)
|
178
|
+
# intervals &= Intervals(li)
|
179
|
+
# return NestEnv(self.s, intervals)
|
180
|
+
|
181
|
+
def __repr__(self):
    """Show the text with every non-newline character outside the located intervals replaced by a space"""

    def blank(text):
        return re.sub(r'[^\n]', ' ', text)

    return self.intervals.replace(self.s, lambda s: s, out_repl=blank)
|
185
|
+
|
186
|
+
def __bool__(self):
    """The truth value of the object is the truth value of its interval set"""
    return bool(self.intervals)
|
189
|
+
|
190
|
+
def __eq__(self, other):
|
191
|
+
return self.s == other.s and self.intervals == other.intervals
|
192
|
+
|
193
|
+
def string(self, idx=0):
    """Text of the idx-th located interval (the first by default); None when there is no such interval

    >>> NestEnv('11a22b33a44bcc').find2('a', 'b').string()
    'a22b'
    """
    if not self.intervals or idx >= len(self.intervals):
        return None
    span = self.intervals.li[idx]
    return self.s[span.start():span.end()]
|
204
|
+
|
205
|
+
def strings(self):
    """Texts of all located intervals, in iteration order; an empty list when nothing is located"""
    if not self.intervals:
        return []
    text = self.s
    return [text[r.start():r.end()] for r in self.intervals]
|
211
|
+
|
212
|
+
def startlines(self, unique=False):
    r""" 1-based line number, within the original text s, at which each located interval starts

    :param unique: return the distinct line numbers in ascending order instead of one per interval

    >>> NestEnv('{}\naa\n{}\n{}{}a\nb').inside('{', '}').startlines()
    [1, 3, 4, 4]
    """
    if not self.intervals:
        return []

    # offsets of every newline, plus the text length as a closing sentinel
    breaks = [pos for pos, ch in enumerate(self.s) if ch == '\n']
    if self.s:
        breaks.append(len(self.s))

    # the number of newlines strictly before an interval's start, plus one, is its line number
    numbers = [bisect.bisect_left(breaks, x.start()) + 1 for x in self.intervals]
    return sorted(set(numbers)) if unique else numbers
|
227
|
+
|
228
|
+
def group(self, idx=0):
    """ The idx-th located interval as a match-like object (ReMatch); None when there is no such interval

    Note that the intervals are numbered from 0, which differs from group(0) of the re library!
    """
    if not self.intervals or idx >= len(self.intervals):
        return None
    chosen = self.intervals.li[idx]
    return ReMatch(chosen.regs, self.s, 0, len(self.s))
|
238
|
+
|
239
|
+
def groups(self):
    """All located intervals as match-like objects (ReMatch); an empty list when nothing is located"""
    if not self.intervals:
        return []
    text = self.s
    return [ReMatch(r.regs, text, 0, len(text)) for r in self.intervals]
|
245
|
+
|
246
|
+
# TODO def gettag、settag、gettags、settags 特殊的inside操作
|
247
|
+
# TODO def getattr、setattr、getattrs、setattrs
|
248
|
+
|
249
|
+
def sub(self, infunc=lambda m: m.group(), *, outfunc=lambda m: m.group(), adjacent=False) -> str:
    """Substitution in the style of ``re.sub``; delegates to ``self.intervals.sub``.

    :param infunc: replacement callback handed to ``Intervals.sub``; the default returns ``m.group()``
    :param outfunc: forwarded as ``out_repl`` (presumably applied to the text outside the
        intervals -- confirm against ``Intervals.sub``)
    :param adjacent: forwarded unchanged to ``Intervals.sub``
    """
    return self.intervals.sub(self.s, infunc, out_repl=outfunc, adjacent=adjacent)
|
252
|
+
|
253
|
+
def replace(self, arg1, arg2=None, *, outfunc=lambda s: s, adjacent=False) -> str:
    """ Replacement in the style of ``str.replace``; delegates to ``self.intervals.replace``

    arg1 may be a custom replacement function, or arguments may be given as in str.replace(arg1, arg2)

    :param outfunc: forwarded as ``out_repl``; the default returns its argument unchanged
    :param adjacent: forwarded unchanged to ``Intervals.replace``
    """
    return self.intervals.replace(self.s, arg1, arg2, out_repl=outfunc, adjacent=adjacent)
|
259
|
+
|
260
|
+
def __invert__(self):
    r""" ``~obj``: a new object of the same type whose intervals are ``self.intervals.invert(len(self.s))``

    >>> (~NestEnv('aa$b$cc').find2('$', '$')).strings()
    ['aa', 'cc']
    """
    return type(self)(self.s, self.intervals.invert(len(self.s)))
|
266
|
+
|
267
|
+
def invert(self):
    r""" Method form of the ``~`` operator

    >>> NestEnv('aa$b$cc').find2('$', '$').invert().strings()
    ['aa', 'cc']
    """
    return ~self
|
273
|
+
|
274
|
+
def __and__(self, other):
    r""" Intersection of interval sets

    ``other`` may be a NestEnv over the same text, an Intervals object, or anything
    that Intervals can be built from.

    >>> s = 'aa$b$ccc$dd$eee'
    >>> (NestEnv(s).find2('$', '$') & NestEnv(s).inside('a', 'd')).strings()
    ['$b$', '$d']
    >>> (NestEnv(s).find2('$', '$') & re.finditer(r'a.*?d', s)).strings()
    ['$b$', '$d']
    """
    if isinstance(other, Intervals):
        rhs = other
    elif isinstance(other, NestEnv):
        if self.s != other.s:  # objects over different texts cannot be combined
            raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的交')
        rhs = other.intervals
    else:  # everything else is converted to an Intervals object
        rhs = Intervals(other)
    return type(self)(self.s, self.intervals & rhs)
|
292
|
+
|
293
|
+
def __or__(self, other):
    """ Union (addition) of interval sets

    ``other`` may be a NestEnv over the same text, an Intervals object, or anything
    that Intervals can be built from.

    >>> s = 'aa$b$ccc$dd$eee'
    >>> (NestEnv(s).find2('$', '$') | NestEnv(s).inside('a', 'd')).strings()
    ['aa$b$ccc$dd$']
    >>> (NestEnv(s).find2('$', '$') | re.finditer(r'a.*?d', s)).strings()
    ['aa$b$ccc$dd$']
    """
    if isinstance(other, Intervals):
        rhs = other
    elif isinstance(other, NestEnv):
        if self.s != other.s:
            raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的并')
        rhs = other.intervals
    else:  # everything else is converted to an Intervals object
        rhs = Intervals(other)
    return type(self)(self.s, self.intervals | rhs)
|
310
|
+
|
311
|
+
def __add__(self, other):
    """``a + b`` is an alias of ``a | b``"""
    return self | other
|
313
|
+
|
314
|
+
def __sub__(self, other):
    """ Difference of interval sets

    ``other`` may be a NestEnv over the same text, an Intervals object, or anything
    that Intervals can be built from.

    >>> s = 'aa$b$ccc$dd$eee'
    >>> (NestEnv(s).find2('$', '$') - NestEnv(s).inside('a', 'd')).strings()
    ['d$']
    >>> (NestEnv(s).find2('$', '$') - re.finditer(r'a.*?d', s)).strings()
    ['d$']
    """
    if isinstance(other, Intervals):
        rhs = other
    elif isinstance(other, NestEnv):
        if self.s != other.s:
            raise ValueError('两个NestEnv的主文本内容不相同,子区间集不能相减')
        rhs = other.intervals
    else:  # everything else is converted to an Intervals object
        rhs = Intervals(other)
    return type(self)(self.s, self.intervals - rhs)
|
331
|
+
|
332
|
+
def nest(self, func, invert=False):
    """ Run one more level of locating inside every current interval

    :param func: a function of the form func(s)
        It receives the text of one interval and returns an "Intervals-like" object
    :param invert: invert the result within each interval before collecting it
    :return: a new object of the same type as self

    Practically every locating feature should be built on this pattern, because the
    match must not run over the whole of self.s but only, nested, over the spans
    marked by self.intervals.
    """
    collected = []
    for reg in self.intervals:
        base = reg.start()
        chunk = self.s[base:reg.end()]
        local = Intervals(func(chunk))
        if invert:
            local = local.invert(len(chunk))
        # func works in chunk coordinates; shift back to coordinates of self.s
        collected.extend(local + base)
    return type(self)(self.s, Intervals(collected))
|
351
|
+
|
352
|
+
def highlight(self, colors=None, **kwargs):
    """Delegate to ``highlight_intervals`` with this object's text and intervals.

    :param colors: forwarded as the ``colors`` keyword
    :param kwargs: forwarded unchanged
    :return: whatever ``highlight_intervals`` returns
    """
    # imported here rather than at module level
    from pyxllib.algo.intervals import highlight_intervals
    return highlight_intervals(self.s, self.intervals, colors=colors, **kwargs)
|
355
|
+
|
356
|
+
|
357
|
+
class NestEnv(__NestEnvBase):
|
358
|
+
"""可以在该类扩展特定字符串匹配功能
|
359
|
+
实现方法可以参照find、find2
|
360
|
+
核心是要实现一个core函数
|
361
|
+
支持输入一个字符串s,能计算出需要定位的子区间集位置
|
362
|
+
"""
|
363
|
+
|
364
|
+
def find(self, head, invert=False):
    r"""Locate every non-overlapping occurrence of the plain string head (no tail involved)

    :param head: the substring to look for; must not be empty
    :param invert: locate the text between the occurrences instead
    :raises ValueError: if head is empty (an empty needle matches everywhere without
        advancing, which previously made the search loop forever)

    >>> ne = NestEnv('111222333')
    >>> ne.find('2').strings()
    ['2', '2', '2']
    >>> ne.find('2', invert=True).strings()
    ['111', '333']
    >>> ne.find('4').strings()
    []
    >>> ne.find('4', invert=True).strings()
    ['111222333']
    >>> ne.find('22').find('2').strings()
    ['2', '2']
    """
    if not head:
        raise ValueError('head must be a non-empty string')

    def core(s):
        parts, width = [], len(head)
        start = s.find(head)
        while start != -1:
            stop = start + width
            parts.append([start, stop])
            start = s.find(head, stop)  # resume after the match: occurrences do not overlap
        return parts

    return self.nest(core, invert)
|
390
|
+
|
391
|
+
def find2(self, head, tail, *, inner=False, invert=False, symmetry=False):
    r""" Paired-marker matching: from each head to the nearest tail after it

    :param head: a plain string by default; a pattern from re.compile is also supported
    :param tail: same as head
    :param inner: record only the text between head and tail, trimmed by pqmove
    :param invert: forwarded to nest
    :param symmetry: require the numbers of heads and tails inside a match to be equal
        This algorithm is much slower; do not enable it unless necessary

    >>> ne = NestEnv('111222333')
    >>> ne.find2('1', '3').strings()
    ['1112223']
    """

    def core(s):
        pos1, parts = 0, []
        while True:
            # find the next head
            pos2, pos2end = findspan(s, head, pos1)
            if pos2 == -1:
                break

            # the nearest tail that appears after that head
            pos3, pos1 = findspan(s, tail, pos2end)

            if symmetry:
                # keep extending to the next tail until head/tail counts balance
                while True:
                    substr = s[pos2:pos1]
                    cnt1, cnt2 = substr_count(substr, head), substr_count(substr, tail)

                    if pos3 == -1 or cnt1 == cnt2:
                        break
                    else:
                        pos3, pos1 = findspan(s, tail, pos1)

            if pos3 == -1:
                # a head without a tail: nothing is recorded and the scan stops
                break

            # compute and store the coordinates
            if inner:
                parts.append(pqmove(s, pos2end, pos3))
            else:
                parts.append([pos2, pos1])

        return parts

    return self.nest(core, invert)
|
438
|
+
|
439
|
+
def search(self, pattern, flags=0, group=0, invert=False):
    r""" Regular-expression matching

    :param pattern: pattern handed to re.finditer
    :param flags: regex flags handed to re.finditer
    :param group: which group's span to locate; group 0 (the whole match) by default
    :param invert: forwarded to nest

    >>> NestEnv(r'xx\teste{aa}{bb}').search(r'\\test[a-z]*').strings()
    ['\\teste']

    TODO if inner is needed, a named group (?P<inner>.*?) could be used; when an inner group
        name is present, inner would default to True
    """

    def core(s):
        spans = []
        for found in re.finditer(pattern, s, flags=flags):
            spans.append(found.span(group))
        return spans

    return self.nest(core, invert)
|
454
|
+
|
455
|
+
def search2(self, pattern1, pattern2, *, inner=False, flags1=0, flags2=0, invert=False):
    """ Paired regular-expression matching

    Compiles pattern1/pattern2 with flags1/flags2 and hands them to find2 as head and tail.

    TODO the implementation can probably follow find2
    """
    head = re.compile(pattern1, flags=flags1)
    tail = re.compile(pattern2, flags=flags2)
    return self.find2(head, tail, inner=inner, invert=invert)
|
462
|
+
|
463
|
+
def bracket(self, head, tail=None, inner=False, *, latexenv=False, invert=False):
    r""" Bracket matching anchored on the trailing bracket of head

    The last character of head is the bracket used for matching.
    tail may be customised and may even be longer than 1 character, but with a length above 1
    the algorithm is buggy; it merely does not raise.

    :param tail: closing marker; when omitted and head ends with one of ``[{(<`` the matching
        closing bracket is inferred
    :param inner: record only the text between head and tail, trimmed by pqmove
    :param latexenv: bracket matching in a latex environment, which must ignore comments and
        escapes such as \{
        This is not finely tuned yet; only the effect of \{ was considered so far
        NOTE(review): the parameter is not read anywhere in this method body
    :param invert: forwarded to nest

    >>> NestEnv('__{_}_[_]_{[_]+[_]}__').bracket('{', '}').bracket('[', ']', inner=True).replace('1')
    '__{_}_[_]_{[1]+[1]}__'

    >>> NestEnv('__{_}_[_]_{[_]+[_]}__').bracket('{', '}', inner=True).bracket('[', ']', invert=True).replace('1')
    '__{1}_[_]_{[_]1[_]}__'

    >>> NestEnv(r'xx\ce{b{c}d}b').bracket(r'\ce{').strings()
    ['\\ce{b{c}d}']

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').bracket(r'\ce{', inner=True).bracket(r'\ce{', inner=True).replace(lambda s: 'x')
    '01\\ce{H2O\\ce{x}}01\\ce{1\\ce{x}5}'
    """

    def core(s):
        # # 1 speed-up for common latex command braces (experiments showed hardly any gain, so it is not used)
        # if re.match(r'\\[a-zA-Z]+\{$', head) and tail == '}':
        #     i = 'inner' if inner else 0
        #     brace5 = r'{(?P<inner>(?:[^{}]|{(?:[^{}]|{(?:[^{}]|{(?:[^{}]|{(?:[^{}]|{[^{}]*})*})*})*})*})*)}'
        #     return [m.span(i) for m in re.finditer(r'\\' + head[1:-1] + brace5, s)]

        # 2 the regular implementation
        pos1, parts = 0, []
        # TODO grp_bracket(5, head[-1]): speed up with the grp_bracket regex approach (needs a performance comparison and some testing)
        pos2 = s.find(head, pos1)
        while pos2 >= 0:
            # find the tail position, i.e. the target span
            p = bracket_match2(s, pos2 + len(head) - 1)
            if not p:
                # no closing position reported: pad the local copy of s and use its end
                s += ' '
                p = len(s)
            pos1 = p + 1

            if inner:
                # NOTE(review): len(tail) fails when tail is None, i.e. when it was neither given nor inferred
                parts.append(pqmove(s, pos2 + len(head), pos1 - len(tail)))
            else:
                parts.append([pos2, pos1])

            pos2 = s.find(head, pos1)
            if pos2 < pos1: break
        return parts

    # infer the value of tail automatically
    if not tail and head[-1] in '[{(<':  # paired brackets
        tail = {'[': ']', '{': '}', '(': ')', '<': '>'}[head[-1]]

    return self.nest(core, invert)
|
517
|
+
|
518
|
+
def bracket2(self, head, tail=None, inner=False, *, latexenv=False, invert=False):
    r""" Bracket matching anchored on the leading bracket of head

    The first character of head is the bracket used for matching.

    :param tail: closing marker; when omitted and head starts with one of ``[{(<`` the matching
        closing bracket is inferred
    :param inner: record only the text between head and tail, trimmed by pqmove
    :param latexenv: NOTE(review): not read anywhere in this method body
    :param invert: forwarded to nest

    >>> NestEnv(r'xx{\centerline{aa}b}yy').bracket2(r'{\centerline').strings()
    ['{\\centerline{aa}b}']
    """

    def core(s):
        pos1, parts = 0, []
        # TODO consider a regex implementation for speed?
        pos2 = s.find(head, pos1)
        while pos2 >= 0:
            # find the tail position, i.e. the target span
            p = bracket_match2(s, pos2)
            if not p:
                # no closing position reported: pad the local copy of s and use its end
                s += ' '
                p = len(s)
            pos1 = p + 1

            if inner:
                # NOTE(review): len(tail) fails when tail is None, i.e. when it was neither given nor inferred
                parts.append(pqmove(s, pos2 + len(head), pos1 - len(tail)))
            else:
                parts.append([pos2, pos1])

            pos2 = s.find(head, pos1)
            if pos2 < pos1: break
        return parts

    # infer the value of tail automatically
    if not tail and head[0] in '[{(<':  # paired brackets
        tail = {'[': ']', '{': '}', '(': ')', '<': '>'}[head[0]]

    return self.nest(core, invert)
|
550
|
+
|
551
|
+
def xmltag(self, head, inner=False, invert=False, symmetry=True):
    r""" Locate ``<head ...> ... </head>`` elements by handing an opening-tag and a closing-tag
    regex to find2

    :param head: tag name; it is inserted into the regexes as-is

    >>> s = 'a\n<p class="clearfix">\nbb\n</p>c<p a="2"/>cc'
    >>> NestEnv(s).xmltag('p').replace('x')
    'a\nxc<p a="2"/>cc'

    Pitfalls to remember:
    1. The percent-annotation formats are few, so being slightly loose is fine, e.g. self-closing
       tags do not occur there
    2. Parsing general xml with regexes can hardly be made rigorous and easily produces bugs
    3. A standard xml parser does not work either, because many texts are not strictly xml documents
    4. Only a character-by-character parser in the compiler-theory sense seems able to guarantee
       accuracy... but that is impractical; the workable option remains regexes, loose and risky
       as they are
    """
    # self-closing tags such as <a/> are not considered for now
    open_tag = re.compile(rf'<({head})(?:>|\s.*?>)', flags=re.DOTALL)
    close_tag = re.compile(f'</{head}>')
    return self.find2(open_tag, close_tag, inner=inner, invert=invert, symmetry=symmetry)
|
568
|
+
|
569
|
+
def attr(self, name, part=0, prefix=r'(?<![a-zA-Z])', suffix=r'\s*=\s*', invert=False):
    r""" Locate ``name=value`` style attributes

    :param name: attribute name, as a regex
    :param prefix: assertion that must hold right before the name
    :param suffix: the separator between the name and the value
    :param part:
        'value', the bare attribute value without quotes
        'rawvalue', the value including the quotes when it is quoted
        'name', only the attribute name
        'name-op', the attribute name and the = separator
        0, (default) the whole thing
    :param invert: forwarded to nest
    :raises ValueError: for an unknown part (raised once a match is found)

    A quoted value runs up to the next occurrence of the same quote character. An
    opening quote without a closing one is handled like an unquoted value (a run of
    non-whitespace); a name at the very end of the text gets an empty value.

    >>> NestEnv('a b=12 3 c').attr('b', 'value').string()
    '12'
    >>> NestEnv('a b=123 c').attr('a', 'value').string()
    >>> NestEnv('a b="123" c').attr('b', 'rawvalue').string()  # the value including its quotes
    '"123"'
    >>> NestEnv("a b='123' c").attr('b', 'rawvalue').string()
    "'123'"
    >>> NestEnv('a b="123" c').attr('b', 'value').string()  # only the value
    '123'
    >>> NestEnv('a b="123" c').attr('b').string()  # the whole thing
    'b="123"'
    >>> NestEnv('a=ab b="123" c').attr(r'(a|b)', 'rawvalue').strings()  # regex attribute name
    ['ab', '"123"']
    >>> NestEnv('a=ab b="123" c').attr(r'(a|b)', 'name').strings()
    ['a', 'b']
    >>> NestEnv('a= ab b ="123" c').attr(r'(a|b)', 'name-op').strings()
    ['a= ', 'b =']
    """
    pattern0 = fr'{prefix}{name}(?P<op>{suffix})'

    def core(s):
        p, parts = 0, []
        while True:
            m0 = re.search(pattern0, s[p:])
            if not m0:
                break

            q = p + m0.end()
            ch = s[q] if q < len(s) else ''
            m1 = None
            # ``ch`` must be tested for emptiness first: '' is "in" every string
            if ch and ch in '"\'':
                # exclude the actual quote character inside the value
                m1 = re.match(fr'{ch}[^{ch}]*{ch}', s[q:])
            if m1:
                inner_left, inner_right = m1.start() + 1, m1.end() - 1
            else:  # no quotes, or an opening quote that is never closed
                m1 = re.search(r'\S*', s[q:])
                inner_left, inner_right = m1.span()

            if part == 0:
                parts.append([p + m0.start(), q + m1.end()])
            elif part == 'value':
                parts.append([q + inner_left, q + inner_right])
            elif part == 'rawvalue':
                parts.append([q + m1.start(), q + m1.end()])
            elif part == 'name':
                parts.append([p + m0.start(), p + m0.start('op')])
            elif part == 'name-op':
                parts.append([p + m0.start(), p + m0.end()])
            else:
                raise ValueError(f'part名称不对{part}')

            p = q + m1.end()
        return parts

    return self.nest(core, invert)
|
634
|
+
|
635
|
+
def pathstem(self):
    r""" TODO matching rules related to paths (not implemented; always raises NotImplementedError)

    For example pathstem could be combined with includegraphics('inner'),
    but the interface may not end up designed this way; a general path-handling
    interface might be written instead
    """
    raise NotImplementedError
|
641
|
+
|
642
|
+
def paragraph(self, linefeed=1, invert=False):
    """ Locate paragraphs

    :param linefeed: minimum number of newline characters that separates two paragraphs
    :param invert: locate the separators instead of the paragraphs
    """

    def core(s):
        # whitespace runs containing enough newlines are the paragraph separators
        return [gap for gap in re.finditer(r'\s+', s) if gap.group().count('\n') >= linefeed]

    # core locates the separators, so the effective invert flag has to be negated
    return self.nest(core, not invert)
|
652
|
+
|
653
|
+
|
654
|
+
class LatexNestEnv(NestEnv):
|
655
|
+
def includegraphics(self, part=0, invert=False):
    r""" Capture the various figure-inserting commands; with inner only the image file name is located

    :param part:
        0 the whole command
        cmd the command name
        optional the content of the optional argument
        inner the argument inside the curly braces
        stem the bare file name in inner, without path and extension (raises NotImplementedError)
    :param invert: forwarded to nest

    >>> s = r'阳离子:\underline{\includegraphics{18pH-g1=8-8.eps}\qquad \figt[9pt]{18pH-g1=8-9.eps}}'
    >>> LatexNestEnv(s).includegraphics().strings()
    ['\\includegraphics{18pH-g1=8-8.eps}', '\\figt[9pt]{18pH-g1=8-9.eps}']
    >>> LatexNestEnv(s).includegraphics('inner').strings()
    ['18pH-g1=8-8.eps', '18pH-g1=8-9.eps']
    """
    if part == 'stem':
        raise NotImplementedError

    # a braced argument whose regex spells out up to three further nested brace levels
    braced = '{(?P<inner>(?:[^{}]|{(?:[^{}]|{(?:[^{}]|{[^{}]*})*})*})*)}'
    command = r'\\(?P<cmd>includegraphics|figt|figc|figr|fig)(?P<optional>.*?)' + braced

    def core(s):
        hits = re.finditer(command, s, flags=re.DOTALL | re.MULTILINE)
        return [hit.span(part) for hit in hits]

    return self.nest(core, invert)
|
679
|
+
|
680
|
+
def lewis(self, inner=False, invert=False):
    r""" Match Lewis (electron-dot) structures.

    The command itself is easy to match; the hard part is that in practice
    it is usually extended on both sides (neighbouring ``H`` and ``~``).

    :param inner: not supported here; a truthy value makes the matcher raise
    :param invert: forwarded unchanged to ``self.nest``
    :raises ValueError: from the matcher, when ``inner`` is truthy

    >>> LatexNestEnv(r'aa H\Lewis{0:2:4:6:,N}H bb').inside(r'\lewis').strings()
    ['H\\Lewis{0:2:4:6:,N}H']
    >>> LatexNestEnv(r'aa H\Lewis{0:2:4:6:,N}H bb').lewis().strings()
    ['H\\Lewis{0:2:4:6:,N}H']
    """

    def core(s):
        if inner:  # locating only the content of \lewis{...} is not offered by this method
            raise ValueError(r"lewis模式没有inner模式,如果需要可以使用NestEnv(s).inner(r'\lewis{')")

        # basic pattern: the command followed by its brace group
        unit = r'\\(l|L)ewis' + grp_bracket(5, inner=True)
        spans = []
        # allow the match to stretch over adjacent H / ~ pieces
        for hit in re.finditer(rf'(H?~*{unit}\s*|~*H)*(~*{unit}|~*H)', s):
            # the extended pattern can also match a lone H; keep real lewis hits only
            if 'lewis' in hit.group().lower():
                spans.append(hit.span(0))
        return spans

    return self.nest(core, invert)
|
699
|
+
|
700
|
+
def item(self, invert=False):
    r""" Locate the interval of every item.

    Mainly used in word-to-latex conversion, for text that has no percent
    annotations yet but already contains basic \itemQ, \test, itemKey etc.

    TODO: support an ``inner`` mode

    :param invert: forwarded unchanged to ``self.nest``
    """
    markers = (r'\item', r'\test')

    def core(s):
        spans = []
        begin = strfind(s, markers, start=0)
        while begin >= 0:
            # the item runs up to the next marker, or to the end of the text
            stop = strfind(s, markers, start=begin + 1)
            if stop == -1:
                s += ' '
                stop = len(s)
            if stop < len(s) and s[stop] == '\n':
                stop += 1
            # trim trailing blanks/newlines off the item
            while s[stop - 1] in ' \t\n':
                stop -= 1
            spans.append([begin, stop])
            begin = strfind(s, markers, start=stop)
            if begin < stop:
                break
        return spans

    return self.nest(core, invert)
|
726
|
+
|
727
|
+
def latexcmd(self, name=r'[a-zA-Z]+', *, part=0, star=True, optional=True,
             min_bracket=0, max_bracket=float('inf'), brackets=None,
             linefeed=1, invert=False):
    r""" Match the intervals covered by latex commands.

    :param name: command name given as a regular expression; the leading ``\\``
        and the trailing ``(?![a-zA-Z])`` are added here and must not be included
    :param part: TODO, not implemented yet (the value is currently ignored).
        Planned values:
        0, the whole block
        name, only the command name (including * when there is a star)
        optional, the optional-argument part
        optional-value, the value inside the optional argument

        rawvalues, the whole brace run including the braces, e.g. {...}{...}{...}
        rawvalue1, only the first brace pair of every command
        rawvalue2, only the second brace pair of every command
        ...rawvalue

        values, only the values inside the braces; with several brace pairs the
            result is split into sub-intervals (2 commands may give 6 intervals)
        value1, only the value in the first brace pair of every command
        value2, only the value in the second brace pair of every command
        ...valuex
    :param star: whether a ``*`` may follow the command (default yes)
    :param optional: whether a ``[...]`` optional argument may follow (default yes)
    :param min_bracket: minimum number of brace pairs required (default none)
    :param max_bracket: maximum number of brace pairs consumed; inf means no limit
    :param brackets: exact number of brace pairs; when given (0 included),
        ``min_bracket`` and ``max_bracket`` are both reset to this value
    :param linefeed: maximum number of newlines allowed between consecutive pieces
    :param invert: forwarded unchanged to ``self.nest``
    :return: whatever ``self.nest`` returns

    TODO: think about how an ``inner`` parameter could be added later

    Algorithm: because of the many special cases the text is scanned forward
    one step at a time.
    Note: forms like multirow, whose 2nd argument may be ``*``, are not
    supported here; a dedicated matcher would be needed for them.

    >>> LatexNestEnv('\n\\ssb{有关概念及其相互关系}\n\n{\\includegraphics{19pS-g4=5-1.png}}').latexcmd().replace('')
    '\n\n\n{}'
    """
    # `is not None` so that brackets=0 ("exactly zero brace pairs") is honoured
    if brackets is not None:
        min_bracket = max_bracket = brackets

    def core(s):
        # the patterns do not change while scanning, so they are built once
        cmd_re = re.compile(r'\\(' + name + r')(?![a-zA-Z])')
        star_re = re.compile(r'(\s*)(\*)') if star else None
        opt_re = re.compile(r'(\s*)(' + grp_bracket(5, '[') + ')') if optional else None
        arg_re = re.compile(r'(\s*)(' + grp_bracket(5) + ')') if max_bracket > 0 else None

        def advance(regex, pos):
            """ Position just past `regex` when it matches at `pos` with at most
            `linefeed` newlines of leading whitespace, otherwise None. """
            m = regex.match(s[pos:])
            if m and m.group(1).count('\n') <= linefeed and m.group(2):
                return pos + m.end()
            return None

        right, parts = 0, []
        while True:
            m0 = cmd_re.search(s[right:])
            if not m0:
                break
            left, right = m0.start() + right, m0.end() + right

            for piece_re in (star_re, opt_re):
                if piece_re is not None:
                    nxt = advance(piece_re, right)
                    if nxt is not None:
                        right = nxt

            limit = max_bracket
            if max_bracket == float('inf'):
                if m0.group(1) in ('begin', 'end'):
                    limit = 1  # these commands take exactly one brace pair
                if m0.group(1) in ('hfil', 'hfill'):
                    limit = 0  # these commands take no brace pair at all

            cur_cnt = 0
            while cur_cnt < limit:
                nxt = advance(arg_re, right)
                if nxt is None:
                    break
                right = nxt
                cur_cnt += 1

            if cur_cnt >= min_bracket:
                parts.append([left, right])

        return parts

    return self.nest(core, invert)
|
806
|
+
|
807
|
+
def latexcmd0(self, name=r'[a-zA-Z]+', *, part=0, star=False, optional=False,
              min_bracket=0, max_bracket=0, brackets=None,
              linefeed=1, invert=False):
    r""" Match only the command itself: by default no star, no optional argument, no braces.

    Every argument is forwarded to ``latexcmd``; only the defaults differ.
    """
    options = dict(part=part, star=star, optional=optional,
                   min_bracket=min_bracket, max_bracket=max_bracket, brackets=brackets,
                   linefeed=linefeed, invert=invert)
    return self.latexcmd(name, **options)
|
815
|
+
|
816
|
+
def latexcmd1(self, name=r'[a-zA-Z]+', *, part=0, star=True, optional=True,
              min_bracket=1, max_bracket=1, brackets=None,
              linefeed=1, invert=False):
    r""" Match latex commands that have exactly one brace pair (by default).

    Every argument is forwarded to ``latexcmd``; only the defaults differ.
    """
    options = dict(part=part, star=star, optional=optional,
                   min_bracket=min_bracket, max_bracket=max_bracket, brackets=brackets,
                   linefeed=linefeed, invert=invert)
    return self.latexcmd(name, **options)
|
824
|
+
|
825
|
+
def latexenv(self, head, tail=None, inner=False, invert=False):
    r""" Match latex \begin ... \end environments, with support for self-nesting.

    :param head: either a bare environment name (letters and ``*``), which is
        expanded to ``\begin{name}``, or the full opening text starting with
        ``\begin{name}``
    :param tail: closing text; when omitted it is derived from ``head`` as
        ``\end{name}``
    :param inner: report only the content between head and tail (adjusted by
        ``pqmove``) instead of the whole environment
    :param invert: forwarded unchanged to ``self.nest``

    >>> s = r"\begin{center}\begin{tabular}\begin{tabular}\end{tabular}\end{tabular}\end{center}"
    >>> LatexNestEnv(s).latexenv('tabular').replace('x')
    '\\begin{center}x\\end{center}'

    TODO: because of self-nesting it is not yet practical to let ``head`` be a regex
    """
    # parameter inference
    if re.match(r'[a-zA-Z*]+$', head):
        head = r'\begin{' + head + '}'
    if not tail and re.match(r'\\begin{[a-zA-Z*]+}', head):
        tail = r'\end' + re.match(r'\\begin({[a-zA-Z*]+})', head).group(1)

    def core(s):
        # The outermost head may carry extra text after \begin{name} (tail may not);
        # nested openers/closers are only recognised in their plain form h / t.
        h = re.match(r'\\begin{[a-zA-Z*]+}', head).group()
        t = re.match(r'\\end{[a-zA-Z*]+}', tail).group()

        parts, pos1 = [], 0
        while True:
            pos2 = s.find(head, pos1)
            if pos2 == -1:
                break
            pos1 = pos2 + len(head)
            opened, closed = 1, 0
            # Extend the candidate [pos2, pos1) to the next t until openers and
            # closers inside it balance.  Counting only inside the candidate
            # keeps stray \end{...} earlier in the text from breaking the
            # balance and avoids rescanning the prefix on every step.
            while opened != closed:
                pos1 = s.find(t, pos1)
                if pos1 == -1:
                    break
                pos1 += len(t)
                opened, closed = s.count(h, pos2, pos1), s.count(t, pos2, pos1)
            if pos1 == -1:
                # unbalanced data: no closer left, so nothing further can match
                break
            if inner:
                parts.append(pqmove(s, pos2 + len(head), pos1 - len(tail)))
            else:
                parts.append([pos2, pos1])
        return parts

    return self.nest(core, invert)
|
866
|
+
|
867
|
+
def latexenv1(self, inner=False, invert=False):
    r""" Locate every outermost \begin ... \end environment in the text.

    Built on top of the basic ``latexenv`` matcher.

    :param inner: forwarded to ``latexenv``
    :param invert: forwarded unchanged to ``self.nest``
    """

    def core(s):
        parts, offset = [], 0
        while True:
            rest = s[offset:]
            # first \begin{xxx} in the remaining text
            opener = re.search(r'\\begin{([a-zA-Z*]+)}', rest)
            if not opener:
                break

            # first matched begin/end pair for that environment name
            env = LatexNestEnv(rest).latexenv(opener.group(1), inner=inner).group(0)
            if not env:
                # pairing can fail as well; a warning log might be appropriate here
                break
            parts.append([offset + env.start(), offset + env.end()])
            offset += env.end()

        return parts

    return self.nest(core, invert)
|
895
|
+
|
896
|
+
def latexcomment(self, include_pxmltag=False, invert=False):
    """ Locate latex comment text (an unescaped ``%`` up to the end of the line).

    :param include_pxmltag: whether percent tags (``%<...``) count as comments
        too; excluded by default
    :param invert: forwarded to ``self.search``
    """
    pattern = r'(?<!\\)%.*' if include_pxmltag else r'(?<!\\)%(?!<).*'
    return self.search(pattern, invert=invert)
|
906
|
+
|
907
|
+
def formula(self, inner=False, invert=False, *, mode=3):
    r""" Match formulas.

    >>> LatexNestEnv(r'aa$bb$cc').formula().strings()
    ['$bb$']

    :param inner: which part of each formula to locate
        False, 0 (default)  the formula together with its delimiters
        True                only the formula content, without the delimiters
    :param invert: forwarded unchanged to ``self.nest``
    :param mode: matching mode, a bitwise combination of
        1, $...$ and $$...$$
        2, $\begin{array}...\end{array}$
        4, \[...\]
        8, \(...\)

        Formulas without delimiters? (Too complex for here; only a first
        idea, to be implemented elsewhere if ever.)

    TODO: for input like "$$ x xx $xx$", where the leading $$ lost its content,
        there should be some error handling, at least reporting the position
    """
    rules = (
        # escaping matters online; offline a $ right after \\ may need no handling
        (1, r'(?<!\\)(\$\$?)(?P<inner>.*?)(?<!\\)\1'),
        (2, r'\$\s*\\begin{array}\s*(?P<inner>.*?)\s*\\end{array}\s*\$'),
        (4, r'\\\[(?P<inner>.*?)\\\]'),
        (8, r'\\\((?P<inner>.*?)\\\)'),
    )

    def core(s):
        if r'\\$' in s:
            raise ValueError(r'内容中含有\\$,请先跑「refine_formula」加上空格')
        group = 'inner' if inner else 0

        res = Intervals()
        for bit, pattern in rules:
            if mode & bit:
                res += Intervals([m.span(group) for m in re.finditer(pattern, s, flags=re.DOTALL)])
        return res

    return self.nest(core, invert)
|
951
|
+
|
952
|
+
def pxmltag(self, name, part=0, invert=False):
    r""" Match percent-tag structures.

    p (short for percent) + xml + tag

    :param name: tag name without the ``%<`` prefix; it is inserted into the
        patterns as a regular expression, e.g. ``stem|answer``
    :param part:
        0            the complete match
        head         content before the opening tag, including surrounding whitespace
        inner_head   content before the opening tag, without surrounding whitespace
        open         the opening tag, %<...>
        open_name    the name inside the opening tag
        open_attrs   the attributes of the opening tag
        inner_open   the opening tag without %< and >, inner whitespace kept
        body         content between the opening and closing tags, with surrounding whitespace
        inner        content between the opening and closing tags, without surrounding whitespace
        close        the closing tag, %</...>
        close_name   same as inner_close
        inner_close  the closing tag without %</ and >, inner whitespace kept
        tail         content after the closing tag, with surrounding whitespace; note that
                     head and tail may overlap when locating e.g. topics
        inner_tail   content after the closing tag, without surrounding whitespace
    :param invert: forwarded unchanged to ``self.nest``
    :raises ValueError: when ``name`` starts with ``%<`` or ``part`` is unknown

    >>> s = 'a\n%<topic>\nbb\n%</topic>c'
    >>> LatexNestEnv(s).pxmltag('topic').replace('x')
    'a\nxc'

    TODO: how to avoid interference from ``>`` inside attribute values?
    """

    def find_head(s):
        ms = re.finditer(r'(?P<head>\s*(?P<inner_head>.*?)\s*)(?P<tail>%<.*?>)', s)
        return [m.span(part) for m in ms if re.search(fr'%<({name})', m.group('tail'))]

    def find_open(s):
        pattern = fr'(?<!\\)%<(?P<open_name>{name})(>|\s[^\n]*?>)'
        if part == 'open':
            return [m.span() for m in re.finditer(pattern, s)]
        elif part == 'inner_open':
            # strip the leading "%<" and the trailing ">"
            return [[m.start() + 2, m.end() - 1] for m in re.finditer(pattern, s)]
        elif part == 'open_name':
            return [m.span('open_name') for m in re.finditer(pattern, s)]
        elif part == 'open_attrs':
            return [m.span(2) for m in re.finditer(fr'(?<!\\)%<({name})\s+(.+?)>', s)]
        else:
            raise ValueError(f'part名称不对{part}')

    def find_core(s):
        pattern = fr'(?<!\\)%<({name})(?:>|\s[^\n]*?>)(?P<body>\s*(?P<inner>.*?)\s*)%</\1>'
        return [m.span(part) for m in re.finditer(pattern, s, flags=re.DOTALL | re.MULTILINE)]

    def find_close(s):
        pattern = fr'(?<!\\)%</(?P<close_name>{name})>'
        res = [m.span() for m in re.finditer(pattern, s)]
        if part in ('inner_close', 'close_name'):
            # strip the leading "%</" and the trailing ">"
            res = [[x[0] + 3, x[1] - 1] for x in res]
        return res

    def find_tail(s):
        pattern = fr'(?<!\\)%</({name})>(?P<tail>\s*(?P<inner_tail>.*?)\s*)(?=%<|$)'
        return [m.span(part) for m in re.finditer(pattern, s, flags=re.DOTALL)]

    if name.startswith('%<'):
        raise ValueError

    if part in (0, 'body', 'inner'):
        locator = find_core
    elif part in ('head', 'inner_head'):
        locator = find_head
    elif part in ('tail', 'inner_tail'):
        locator = find_tail
    elif part in ('open', 'inner_open', 'open_name', 'open_attrs'):
        locator = find_open
    elif part in ('close', 'inner_close', 'close_name'):
        # 'close_name' is documented and handled by find_close, so it must be routed here
        locator = find_close
    else:
        raise ValueError(f'part名称不对{part}')
    return self.nest(locator, invert)
|
1027
|
+
|
1028
|
+
def latexpart(self, head, inner=False, invert=False):
    """ Locate document parts via the ``TexPos`` / ``texpos`` helpers.

    TexPos is rather special and has not been migrated into this class yet.

    :param head: part name; used to build the ``{head}Cnt`` and ``{head}{i}`` keys
    :param inner: drop the leading heading command from every part
        (meant to exclude the whole ``\\chapter{...}`` text)
    :param invert: forwarded unchanged to ``self.nest``
    """

    def core(s):
        total = TexPos(s).get(f'{head}Cnt')
        parts = []
        for idx in range(total):
            lo, hi = texpos(s, f'{head}{idx}')
            if inner:
                # NOTE(review): latexcmd1 is called on a plain NestEnv here - confirm
                # that NestEnv provides it, or whether LatexNestEnv was intended.
                rest = (NestEnv(s[lo:hi]).latexcmd1(head, invert=True) + lo).intervals[0]
                lo, hi = rest.start(), rest.end()
            parts.append([lo, hi])
        return parts

    return self.nest(core, invert)
|
1043
|
+
|
1044
|
+
def latexparagraph(self, linefeed=2):
    """ Locate latex paragraphs.

    Latex paragraphs cannot simply reuse ``paragraph``: comment-like
    percent-tag text has to be taken into account, so the paragraphs are
    searched again for ``%<...`` text with ``invert=True``.

    :param linefeed: forwarded to ``paragraph``
    """
    paragraphs = self.paragraph(linefeed=linefeed)
    return paragraphs.search(r'\n?(?<!\\)%<.+\n?', invert=True)
|
1049
|
+
|
1050
|
+
|
1051
|
+
def substr_intervals(s, head, tail=None, invert=False, inner=False):
    """ Legacy helper, not recommended; prefer the NestEnv interface directly.

    :param s: the text
    :param head: opening marker
        TODO: regex and non-regex heads could be handled by separate helpers
    :param tail: closing marker; inferred from ``head`` when omitted
        TODO: support mixing plain strings and regex objects for head/tail
    :param invert: return the complement of the located intervals within ``s``
    :param inner: TODO: many of the matchers do not support inner mode yet
        False, the located intervals include the markers
        True, the markers are excluded
    :return: an ``Intervals`` object

    TODO: proper handling of tabular nested in tabular?
    TODO: locate topic and sub_topic at the same time?
    """
    closers = {'[': ']', '{': '}', '(': ')', '<': '>'}

    def infer_headtail(head, tail=None):
        """ Expand shorthand head/tail arguments into the full (head, tail) pair. """
        if isinstance(head, str) and tail is None:
            if re.match(r'\$+$', head):  # formula
                tail = head
            elif re.match(r'\\(chapter|section|subsection){', head):
                pass  # tail stays None, which stands for "end not determined by a marker"
            elif head[-1] in '[{(<':  # paired bracket
                tail = closers[head[-1]]
            elif head.startswith('%<'):
                tail = '%/'
            elif head[0] == '<':
                tail = 'xmltag'
            elif re.match(r'\\begin{[a-zA-Z]+}', head):  # latex environment
                m = re.match(r'\\begin({[a-zA-Z]+})', head)
                tail = r'\end' + m.group(1)
            else:  # nothing could be inferred
                tail = None
        return head, tail

    head, tail = infer_headtail(head, tail)

    parts = []
    # 1 bracket matching: last char of head pairs with first char of tail
    #   TODO: the paired bracket of tail could also sit in the middle of the text
    if head[-1] in '[{(<' and tail and tail[0] == closers[head[-1]]:
        parts = NestEnv(s).bracket(head, tail, inner).intervals
    # 2 second kind of bracket matching: first char of head pairs with last char of tail
    elif head[0] in '[{(<' and tail and tail[-1] == closers[head[0]]:
        parts = NestEnv(s).bracket2(head, tail, inner).intervals
    # 3 formulas
    elif head == tail == '$':
        parts = LatexNestEnv(s).formula(inner).intervals
    # 4 percent-tag structure  %<xxx a='yy'> ... %</xxx>
    elif re.match(r'%<[a-zA-Z\-_]+', head) and tail == '%/':
        parts = LatexNestEnv(s).pxmltag(head[2:], 'inner').intervals
    # 5 latex chapter / section / subsection
    elif re.match(r'\\(chapter|section|subsection)', head) and not tail:  # TODO: support inner
        # take .intervals like every other branch before building Intervals below
        parts = LatexNestEnv(s).latexpart(head[1:], inner=inner).intervals
    elif head == r'\item':
        parts = LatexNestEnv(s).item().intervals
    # 7 latex environments
    elif re.match(r'\\begin{([a-zA-Z]+)}', head):
        m1 = re.match(r'\\begin{([a-zA-Z]+)}', head)
        m2 = re.match(r'\\end{([a-zA-Z]+)}', tail)
        if m2 and m1.group(1) == m2.group(1):
            parts = LatexNestEnv(s).latexenv(head, tail, inner).intervals
        else:
            parts = LatexNestEnv(s).find2(head, tail, inner).intervals
    # 8 every figure command in the latex text
    elif head == r'\includegraphics' and tail is None:
        parts = LatexNestEnv(s).includegraphics('inner').intervals
    # 9 lewis structures
    elif head == r'\lewis' and tail is None:
        parts = LatexNestEnv(s).lewis(inner=inner).intervals
    # 10 xml tag nodes
    elif head[0] == '<' and tail == 'xmltag':
        parts = NestEnv(s).xmltag(head[1:], inner).intervals
    # fallback: plain matching
    elif isinstance(head, str) and isinstance(tail, str):
        parts = NestEnv(s).find2(head, tail, inner).intervals
    elif isinstance(head, str) and not isinstance(tail, str):
        parts = NestEnv(s).find(head).intervals

    t = Intervals(parts)
    if invert:
        t = t.invert(len(s))
    return t
|
1135
|
+
|
1136
|
+
|
1137
|
+
def substrfunc(s, head, tail, *, func1=lambda x: x, func2=lambda x: x):
    r""" Find the code blocks inside string s and transform them.

    A block is delimited by head and tail, for example
        "$", "$": also extends to double-dollar formulas; TODO: with $...$, also recognise \(, \)
            r'$\begin{array}', r'\end{array}$' is supported as well
        first kind of bracket matching: the last character of head pairs with the first character of tail
            "\ce{", "}"
            "\text{", "}"
        second kind of bracket matching: the first character of head pairs with the last character of tail
            '{\centerline', '}'
        "\includegraph"

    If the last character of head is one of [{( and the first character of
    tail is the matching ]}), smart bracket matching is used.

    func1 is applied to every located substring, func2 to the text outside of them.

    >>> substrfunc('aa\\itemQ%\naabb\nccdd\n\n\\test\n{A}\n{B}\n\\item\nabc', r'\item', '', func1= lambda x: 'X' + x + 'Y')
    'aaX\\itemQ%\naabb\nccddY\n\nX\\test\n{A}\n{B}Y\nX\\item\nabcY'
    >>> substrfunc(r'aa\verb|bb|cc', r'\verb|', '|', func2 = lambda x: '')
    '\\verb|bb|'
    """
    return substr_intervals(s, head, tail).replace(s, func1, out_repl=func2)
|
1161
|
+
|
1162
|
+
|
1163
|
+
class CppNestEnv(NestEnv):
    def comments(self, *, invert=False):
        """ Locate C++ comments.

        This implementation is not rigorous; it is only good enough for quick use.

        :param invert: forwarded unchanged to ``self.nest``
        """

        def core(s):
            env = NestEnv(s)
            block_comments = env.find2('/*', '*/')
            line_comments = env.search(r'//.+')
            return block_comments + line_comments

        return self.nest(core, invert)
|
1173
|
+
|
1174
|
+
|
1175
|
+
class PyNestEnv(NestEnv):
|
1176
|
+
def imports(self, *, invert=False):
    """ Locate all ``from`` / ``import`` statements; each one is located separately by default.

    :param invert: forwarded unchanged to ``self.nest``
    """

    def core(s):
        env = NestEnv(s)
        # lines that start with 'import ' or 'from '
        plain = env.search(r'^(import|from)\s.+\n?', flags=re.MULTILINE)
        # 'from ... ( ... )' imports written with parentheses
        parenthesised = env.search(r'^from\s.+\([^\)]+\)[ \t]*\n?', flags=re.MULTILINE)
        return plain + parenthesised

    return self.nest(core, invert)
|
1187
|
+
|
1188
|
+
def search(self, pattern, flags=0, group=0, invert=False):
    r""" Regular-expression matching.

    :param pattern: regular expression to look for
    :param flags: ``re`` flags used for the search
    :param group: which group's span is reported, group 0 by default
    :param invert: forwarded unchanged to ``self.nest``

    >>> NestEnv(r'xx\teste{aa}{bb}').search(r'\\test[a-z]*').strings()
    ['\\teste']

    TODO: for inner matching a named group (?P<inner>.*?) could be used, with
        inner defaulting to True whenever that group name is present
    """

    def core(s):
        finder = re.compile(pattern, flags)
        return [hit.span(group) for hit in finder.finditer(s)]

    return self.nest(core, invert)
|
1203
|
+
|
1204
|
+
def identifier(self, name, flags=0, group=0, *, invert=False):
|
1205
|
+
""" 定位特定的标识符位置
|
1206
|
+
这东西比较简单,可以在正则头尾加\b即可,也可以用普通正则实现
|
1207
|
+
|
1208
|
+
:param name: 支持正则格式
|
1209
|
+
|
1210
|
+
"""
|
1211
|
+
|
1212
|
+
def core(s):
|
1213
|
+
return [m.span(group) for m in re.finditer(rf'\b{name}\b', s, flags)]
|
1214
|
+
|
1215
|
+
return self.nest(core, invert)
|