pyxllib 0.3.197__py3-none-any.whl → 3.201.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +14 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +537 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +145 -149
- pyxllib/algo/unitlib.py +62 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +846 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +236 -240
- pyxllib/data/jsonlib.py +85 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1111 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +251 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +493 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +821 -827
- pyxllib/ext/utools.py +345 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +91 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1110 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +757 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +144 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +422 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +681 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2825 -2829
- pyxllib/file/xlsxlib.py +3122 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +58 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1208 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +348 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +110 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +36 -39
- pyxllib/text/airscript.js +754 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +27 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +741 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- pyxllib-3.201.1.dist-info/METADATA +296 -0
- pyxllib-3.201.1.dist-info/RECORD +125 -0
- {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/licenses/LICENSE +190 -190
- pyxllib/ext/old.py +0 -663
- pyxllib-0.3.197.dist-info/METADATA +0 -48
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/WHEEL +0 -0
pyxllib/algo/intervals.py
CHANGED
@@ -1,964 +1,964 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Date : 2019/12/04 11:16
|
6
|
-
|
7
|
-
|
8
|
-
""" 区间类
|
9
|
-
|
10
|
-
关于区间类,可以参考: https://github.com/AlexandreDecan/python-intervals
|
11
|
-
但其跟我的业务场景有区别,不太适用,所以这里还是开发了自己的功能库
|
12
|
-
|
13
|
-
文档: https://histudy.yuque.com/docs/share/365f3a75-28d0-4595-bc80-5e9d6ab36f71#
|
14
|
-
"""
|
15
|
-
|
16
|
-
import collections
|
17
|
-
import itertools
|
18
|
-
import math
|
19
|
-
import re
|
20
|
-
|
21
|
-
|
22
|
-
class Interval:
|
23
|
-
"""
|
24
|
-
这个类要考虑跟正则的 Match 对象兼容,所以有些细节比较“诡异”
|
25
|
-
主要是区间的记录,是用了特殊的 regs 格式
|
26
|
-
即虽然说是一个区间,但这个区间可以标记许多子区间
|
27
|
-
对应正则里的group(0),和众多具体的子区间group(1)、group(2)等
|
28
|
-
正则里是用regs存储这些区间,所以这里的Interval跟正则Match的regs概念相同
|
29
|
-
这里的形式统一为左闭右开
|
30
|
-
"""
|
31
|
-
__slots__ = ('regs',)
|
32
|
-
|
33
|
-
def __init__(self, arg1=None, arg2=None):
|
34
|
-
if isinstance(arg1, int) and isinstance(arg2, int):
|
35
|
-
# 正常的构建方式
|
36
|
-
self.regs = ((arg1, arg2),)
|
37
|
-
elif getattr(arg1, 'regs', None):
|
38
|
-
# 有regs成员变量,则直接取用(一般是re的Match对象传过来转区间类的)
|
39
|
-
self.regs = arg1.regs
|
40
|
-
elif arg2 is None and arg1 and len(arg1) == 2 and isinstance(arg1[0], int):
|
41
|
-
self.regs = (tuple(arg1),)
|
42
|
-
elif arg1:
|
43
|
-
# 直接传入区间集
|
44
|
-
self.regs = tuple(arg1)
|
45
|
-
else:
|
46
|
-
# 空区间
|
47
|
-
self.regs = None
|
48
|
-
self.update()
|
49
|
-
|
50
|
-
def update(self):
|
51
|
-
""" 将空区间统一标记为None
|
52
|
-
|
53
|
-
>>> Interval(5, 5).regs # 返回 None
|
54
|
-
>>> Interval(6, 5).regs
|
55
|
-
"""
|
56
|
-
if self.regs and self.regs[0][0] >= self.regs[0][1]:
|
57
|
-
self.regs = None
|
58
|
-
|
59
|
-
def start(self, idx=0):
|
60
|
-
return self.regs[idx][0] if self.regs else math.inf
|
61
|
-
|
62
|
-
def end(self, idx=0):
|
63
|
-
return self.regs[idx][1] if self.regs else -math.inf
|
64
|
-
|
65
|
-
def __bool__(self):
|
66
|
-
"""
|
67
|
-
>>> bool(Interval(5, 6))
|
68
|
-
True
|
69
|
-
>>> bool(Interval())
|
70
|
-
False
|
71
|
-
>>> bool(Interval(5, 5)) # 由于标记是左闭右开
|
72
|
-
False
|
73
|
-
>>> bool(Interval(6, 5))
|
74
|
-
False
|
75
|
-
"""
|
76
|
-
if self.regs: # self.regs存在还不够,区间必须要有长度
|
77
|
-
return self.end() > self.start()
|
78
|
-
else:
|
79
|
-
return False
|
80
|
-
|
81
|
-
def __repr__(self):
|
82
|
-
""" 所有区间都是左闭右开! 第一组(x~y)是主区间,后面跟的是子区间
|
83
|
-
|
84
|
-
>>> Interval() # 空区间
|
85
|
-
[]
|
86
|
-
>>> Interval(6, 5) # 空区间
|
87
|
-
[]
|
88
|
-
>>> Interval(4, 8) # 会将底层的左闭右开区间值,改成左闭右闭的区间值显示,更直观
|
89
|
-
[4~7]
|
90
|
-
>>> Interval(5, 6) # 只有单个值的,就不写5~5了,而是简洁性写成5
|
91
|
-
[5]
|
92
|
-
>>> Interval(((4, 8), (4, 6), (7, 8))) # 如果有子区间,会在主区间冒号后显示
|
93
|
-
[4~7: 4~5 7]
|
94
|
-
"""
|
95
|
-
li = []
|
96
|
-
if self.regs:
|
97
|
-
li = [(f'{a}~{b - 1}' if b - a > 1 else str(a)) for a, b in self.regs]
|
98
|
-
if len(li) > 1: li[0] += ':'
|
99
|
-
return '[' + ' '.join(li) + ']'
|
100
|
-
|
101
|
-
def __eq__(self, other):
|
102
|
-
"""只要主区间标记范围一致就是相等的
|
103
|
-
>>> Interval(5, 7) == Interval([(5, 7), (4, 6)])
|
104
|
-
True
|
105
|
-
>>> Interval(5, 5) == Interval()
|
106
|
-
True
|
107
|
-
>>> Interval(5, 6) == Interval(5, 7)
|
108
|
-
False
|
109
|
-
"""
|
110
|
-
a = self.regs and self.regs[0]
|
111
|
-
b = other.regs and other.regs[0]
|
112
|
-
return a == b
|
113
|
-
|
114
|
-
def __lt__(self, other):
|
115
|
-
"""两个区间比大小,只考虑regs[0],先比第1个值,如果相同再比第2个值
|
116
|
-
|
117
|
-
>>> Interval(4, 6) < Interval(7, 8)
|
118
|
-
True
|
119
|
-
>>> Interval(4, 5) < Interval(4, 6)
|
120
|
-
True
|
121
|
-
>>> Interval(4, 6) < Interval(5, 6)
|
122
|
-
True
|
123
|
-
>>> Interval(4, 6) > Interval(7, 8) # 虽然只写了<,但python对>也能智能对称操作
|
124
|
-
False
|
125
|
-
"""
|
126
|
-
return (self.end() < other.end()) if self.start() == other.start() else (self.start() < other.start())
|
127
|
-
|
128
|
-
def __and__(self, other):
|
129
|
-
"""
|
130
|
-
>>> Interval(4, 7) & Interval(5, 8)
|
131
|
-
[5~6]
|
132
|
-
>>> Interval(4, 6) & Interval(5, 8)
|
133
|
-
[5]
|
134
|
-
>>> Interval(4, 6) & Interval(6, 8)
|
135
|
-
[]
|
136
|
-
|
137
|
-
# 可以和区间集对象运算,会把区间集当成一系列的子区间,将其上下限范围作为主区间来分析
|
138
|
-
# 注意和实际的线段点集求并的结果相区别:Intervals([(2, 4), [9, 11]]) & Interval(0, 10)
|
139
|
-
>>> Interval(0, 10) & Intervals([(2, 4), [9, 11]])
|
140
|
-
[2~9]
|
141
|
-
"""
|
142
|
-
# 如果左右值不合理,类初始化里自带的update会自动变为None
|
143
|
-
return Interval(max(self.start(), other.start()), min(self.end(), other.end()))
|
144
|
-
|
145
|
-
def __contains__(self, other):
|
146
|
-
"""
|
147
|
-
:param other: 另一个Interval对象
|
148
|
-
:return: self.regs[0] 是否包含了 other.regs[0]
|
149
|
-
|
150
|
-
>>> Interval(6, 8) in Interval(4, 10)
|
151
|
-
True
|
152
|
-
>>> Interval(2, 7) in Interval(4, 10)
|
153
|
-
False
|
154
|
-
>>> Intervals([(1,3), (2,4)]) in Interval(1, 4) # 可以和Intervals混合使用
|
155
|
-
True
|
156
|
-
>>> Interval(2, 7) not in Interval(4, 10)
|
157
|
-
True
|
158
|
-
"""
|
159
|
-
return self.start() <= other.start() and self.end() >= other.end()
|
160
|
-
|
161
|
-
def __or__(self, other):
|
162
|
-
"""两个regs[0]共有的部分
|
163
|
-
:param other: 另一个Interval对象 或 Intervals 区间集对象
|
164
|
-
:return: 返回other对象的类型(注意,会丢失所有子区间!),
|
165
|
-
如果不存在则regs的值为None
|
166
|
-
>>> m1, m2, m3 = Interval(4, 6), Interval(5, 8), Interval(10, 15)
|
167
|
-
>>> m1 | m2 # 注意两个子区间会按顺序排
|
168
|
-
[4~7: 4~5 5~7]
|
169
|
-
>>> m1 | m3
|
170
|
-
[4~14: 4~5 10~14]
|
171
|
-
>>> m1 | Intervals([(2, 4), (7, 9)])
|
172
|
-
[2~8: 2~8 4~5]
|
173
|
-
"""
|
174
|
-
left, right = min(self.start(), other.start()), max(self.end(), other.end())
|
175
|
-
a, b = sorted([self.regs[0], (other.start(), other.end())]) # 小的排左边
|
176
|
-
return Interval(((left, right), a, b))
|
177
|
-
|
178
|
-
def __add__(self, other):
|
179
|
-
"""
|
180
|
-
>>> Interval(10, 15) + 6
|
181
|
-
[16~20]
|
182
|
-
"""
|
183
|
-
if isinstance(other, int):
|
184
|
-
regs = [(t[0] + other, t[1] + other) for t in self.regs]
|
185
|
-
return Interval(regs)
|
186
|
-
else:
|
187
|
-
return self | other
|
188
|
-
|
189
|
-
def __sub__(self, other):
|
190
|
-
"""
|
191
|
-
:param other: 另一个Interval对象 或者 Intervals对象
|
192
|
-
:return: self.regs[0] 减去 other.regs[0] 后剩余的区间(会丢失子空间集)
|
193
|
-
|
194
|
-
区间减区间:,
|
195
|
-
>>> Interval(4, 6) - Interval(5, 8)
|
196
|
-
[4]
|
197
|
-
|
198
|
-
# 这种特殊情况会返回含有两个子区间的Interval对象
|
199
|
-
>>> Interval(0, 10) - Interval(4, 6)
|
200
|
-
[0~9: 0~3 6~9]
|
201
|
-
|
202
|
-
# 这里后者实际区间值并不包含前者,
|
203
|
-
# 但实际是按后者的start、end界定的范围作为一个Interval来减的
|
204
|
-
>>> Interval(4, 7) - Intervals([(2, 3), (7, 8)])
|
205
|
-
[]
|
206
|
-
"""
|
207
|
-
if isinstance(other, Intervals):
|
208
|
-
other = Interval(other.start(), other.end())
|
209
|
-
a, a1, a2 = self, self.start(), self.end()
|
210
|
-
b, b1, b2 = other, other.start(), other.end()
|
211
|
-
if a1 >= b2 or a2 <= b1: # a 与 b 不相交
|
212
|
-
# 这里不能直接返回a,如果a有子区间,会混淆return值类型
|
213
|
-
return Interval(a1, a2)
|
214
|
-
else:
|
215
|
-
c1, c2 = Interval(a1, b1), Interval(b2, a2)
|
216
|
-
if c1 and not c2:
|
217
|
-
return c1
|
218
|
-
elif c2 and not c1:
|
219
|
-
return c2
|
220
|
-
elif not c1 and not c2:
|
221
|
-
return Interval()
|
222
|
-
else:
|
223
|
-
return Interval(((a1, a2), (c1.start(), c1.end()), (c2.start(), c2.end())))
|
224
|
-
|
225
|
-
|
226
|
-
class ReMatch(Interval):
|
227
|
-
"""
|
228
|
-
1、伪re._sre.SRE_Match类
|
229
|
-
真Match类在re.py的第223行
|
230
|
-
有什么办法嘞,标准Match不支持修改成员变量,不支持自定义spes
|
231
|
-
2、这个类同时还可以作为“区间”类使用
|
232
|
-
有配套的Intervals区间集类,有很刁的各种区间运算功能
|
233
|
-
"""
|
234
|
-
__slots__ = ('regs', 'string', 'pos', 'endpos', 'lastindex', 'lastgroup', 're')
|
235
|
-
|
236
|
-
def __init__(self, regs=None, string=None, pos=0, endpos=None, lastindex=None, lastgroup=None, re=None):
|
237
|
-
"""Create a new match object.
|
238
|
-
|
239
|
-
:param regs: 区间值
|
240
|
-
:param string: 原始的完整字符串内容
|
241
|
-
:param pos: 匹配范围开始的位置,一般就是0
|
242
|
-
:param endpos: 匹配范围的结束位置,一般就是原字符串长度
|
243
|
-
:param lastindex: int,表示有多少个子分组
|
244
|
-
:param lastgroup: NoneType,None,The name of the last matched capturing group,
|
245
|
-
or None if the group didn’t have a name, or if no group was matched at all.
|
246
|
-
:param re: 使用的原正则匹配模式
|
247
|
-
"""
|
248
|
-
if getattr(regs, 'regs', None):
|
249
|
-
# 从一个类match对象来初始化
|
250
|
-
m = regs
|
251
|
-
self.pos = getattr(m, 'pos', None)
|
252
|
-
self.endpos = getattr(m, 'endpos', None)
|
253
|
-
self.lastindex = getattr(m, 'lastindex', None)
|
254
|
-
self.lastgroup = getattr(m, 'lastgroup', None)
|
255
|
-
self.re = getattr(m, 're', None)
|
256
|
-
self.string = getattr(m, 'string', None)
|
257
|
-
self.regs = getattr(m, 'regs', None)
|
258
|
-
else:
|
259
|
-
self.regs = regs
|
260
|
-
self.string = string
|
261
|
-
self.pos = pos
|
262
|
-
self.endpos = endpos
|
263
|
-
self.lastindex = lastindex
|
264
|
-
if not self.lastindex and len(self.regs) > 1: self.lastindex = len(self.regs) - 1
|
265
|
-
self.lastgroup = lastgroup
|
266
|
-
self.re = re
|
267
|
-
self.update()
|
268
|
-
|
269
|
-
def group(self, idx=0):
|
270
|
-
return self.string[self.regs[idx][0]:self.regs[idx][1]]
|
271
|
-
|
272
|
-
def expand(self, template):
|
273
|
-
"""Return the string obtained by doing backslash substitution on the
|
274
|
-
template string template.
|
275
|
-
|
276
|
-
好像是个输入'\1'可以返回匹配的第1组类似这样的功能
|
277
|
-
|
278
|
-
:type template: T
|
279
|
-
:rtype: T
|
280
|
-
"""
|
281
|
-
raise NotImplementedError
|
282
|
-
|
283
|
-
def groups(self, default=None):
|
284
|
-
"""Return a tuple containing all the subgroups of the match, from 1 up
|
285
|
-
to however many groups are in the pattern.
|
286
|
-
|
287
|
-
:rtype: tuple
|
288
|
-
"""
|
289
|
-
return tuple(map(lambda x: self.string[x[0]:x[1]], self.regs[1:]))
|
290
|
-
|
291
|
-
def groupdict(self, default=None):
|
292
|
-
"""Return a dictionary containing all the named subgroups of the match,
|
293
|
-
keyed by the subgroup name.
|
294
|
-
|
295
|
-
:rtype: dict[bytes | unicode, T]
|
296
|
-
"""
|
297
|
-
raise NotImplementedError
|
298
|
-
|
299
|
-
def span(self, group=0):
|
300
|
-
"""Return a 2-tuple (start, end) for the substring matched by group.
|
301
|
-
|
302
|
-
:type group: int | bytes | unicode
|
303
|
-
:rtype: (int, int)
|
304
|
-
"""
|
305
|
-
return self.regs[group][0], self.regs[group][1]
|
306
|
-
|
307
|
-
|
308
|
-
class Intervals:
|
309
|
-
|
310
|
-
def __init__(self, li=None):
|
311
|
-
"""
|
312
|
-
:param li: 若干interval对象
|
313
|
-
"""
|
314
|
-
# 1 matches支持list等类型初始化
|
315
|
-
if hasattr(li, 'intervals'):
|
316
|
-
li = li.intervals
|
317
|
-
if isinstance(li, Intervals):
|
318
|
-
self.__dict__ = li.__dict__
|
319
|
-
else:
|
320
|
-
self.li = []
|
321
|
-
if li is None: li = []
|
322
|
-
for m in li:
|
323
|
-
if not isinstance(m, Interval):
|
324
|
-
m = Interval(m)
|
325
|
-
if m: self.li.append(m) # 只加入非空区间
|
326
|
-
self.li.sort() # 按顺序排
|
327
|
-
# 2 生成成员变量
|
328
|
-
# self._start = min([m.start() for m in self.li], default=math.inf)
|
329
|
-
self._start = min([m.start() for m in self.li[:1]], default=math.inf)
|
330
|
-
self._end = max([m.end() for m in self.li], default=-math.inf)
|
331
|
-
|
332
|
-
def start(self):
|
333
|
-
"""和Interval操作方法尽量对称,头尾也用函数来取,不要用成员变量取"""
|
334
|
-
return self._start
|
335
|
-
|
336
|
-
def end(self):
|
337
|
-
return self._end
|
338
|
-
|
339
|
-
def _merge_intersect_interval(self, adjacent=True):
|
340
|
-
if not self: return []
|
341
|
-
li = [self[0]]
|
342
|
-
for m in self[1:]:
|
343
|
-
if li[-1].end() > m.start() or (adjacent and li[-1].end() == m.start()):
|
344
|
-
li[-1] = m | li[-1] # 如果跟上一个相交,则合并过去
|
345
|
-
else:
|
346
|
-
li.append(m) # 否则新建一个区间
|
347
|
-
return li
|
348
|
-
|
349
|
-
def merge_intersect_interval(self, adjacent=False):
|
350
|
-
""" 将存在相交的区域进行合并
|
351
|
-
|
352
|
-
:param adjacent: 如果相邻紧接,也进行拼接
|
353
|
-
|
354
|
-
# 注意(1,3)和(2,4)合并为一个区间了
|
355
|
-
>>> Intervals([(1, 3), (2, 4), (5, 6)]).merge_intersect_interval(True)
|
356
|
-
{[1~3: 1~2 2~3], [5]}
|
357
|
-
>>> Intervals([(1, 2), (2, 3)]).merge_intersect_interval(True)
|
358
|
-
{[1~2: 1 2]}
|
359
|
-
>>> Intervals([(1, 2), (2, 3)]).merge_intersect_interval(adjacent=False)
|
360
|
-
{[1], [2]}
|
361
|
-
"""
|
362
|
-
# 因为可能经常被调用,所以要变成static存储
|
363
|
-
if adjacent:
|
364
|
-
self._li1 = getattr(self, '_li1', None)
|
365
|
-
if self._li1 is None:
|
366
|
-
self._li1 = Intervals(self._merge_intersect_interval(adjacent))
|
367
|
-
return self._li1
|
368
|
-
else:
|
369
|
-
self._li2 = getattr(self, '_li2', None)
|
370
|
-
if self._li2 is None:
|
371
|
-
self._li2 = Intervals(self._merge_intersect_interval(adjacent))
|
372
|
-
return self._li2
|
373
|
-
|
374
|
-
def true_intersect_subinterval(self, other):
|
375
|
-
"""判断改区间集,与other区间集,是否存在相交的子区间(真相交,不含子集关系)
|
376
|
-
|
377
|
-
如果一个区间只是self或other独有,或者一个子区间被另一方的一个子区间完全包含,是存在很多优良性质,方便做很多自动化的
|
378
|
-
否则,如果存在相交的子区间,就麻烦了
|
379
|
-
|
380
|
-
其实就是想把两个区间完全相同的子区间去掉,然后求交就好了~~
|
381
|
-
|
382
|
-
>>> a, b = Intervals([(1,3), (7,9), (10,12)]), Intervals([(2,4), (7,9), (10, 15)])
|
383
|
-
>>> a.true_intersect_subinterval(b) # 在集合交的基础上,去掉完全包含的情况
|
384
|
-
{[2]}
|
385
|
-
"""
|
386
|
-
# 0 区间 转 区间集
|
387
|
-
if isinstance(other, Interval):
|
388
|
-
other = Intervals([other])
|
389
|
-
|
390
|
-
# 1 区间集和区间集做相交运算,生成一个新的区间集
|
391
|
-
"""假设a、b都是从左到右按顺序取得,所以可以对b的遍历进行一定过滤简化"""
|
392
|
-
A = self.merge_intersect_interval()
|
393
|
-
B = other.merge_intersect_interval()
|
394
|
-
li, k = [], 0
|
395
|
-
for a in A:
|
396
|
-
for j in range(k, len(B)):
|
397
|
-
b = B[j]
|
398
|
-
x1, y1, x2, y2 = a.start(), a.end(), b.start(), b.end()
|
399
|
-
if y2 <= x1:
|
400
|
-
# B[0~j]都在a前面,在后续的a中,可以直接从B[j]开始找
|
401
|
-
k = j
|
402
|
-
elif x2 >= y1: # b已经到a右边,后面的b不用再找了,不会有相交
|
403
|
-
break
|
404
|
-
elif (x2 < x1 < y2 < y1) or (x1 < x2 < y1 < y2): # 严格相交,非子集关系
|
405
|
-
li.append(a & b)
|
406
|
-
return Intervals(li)
|
407
|
-
|
408
|
-
def sub(self, s, repl, *, out_repl=None, adjacent=False) -> str:
|
409
|
-
r"""
|
410
|
-
:param repl: 替换的规则函数
|
411
|
-
暂不支持和正则等价的字符串替换规则表达
|
412
|
-
这个得找技巧,用re现成的功能代码,不可能自己暴力解析
|
413
|
-
:param out_repl: 对范围外若有处理需要,可以自定义处理函数
|
414
|
-
:param s: 要处理的文本串
|
415
|
-
原版 re.sub 还有 count 和 flags 参数,这里难开发,暂时先不做这个接口
|
416
|
-
:return:
|
417
|
-
|
418
|
-
>>> s = '0123456789'
|
419
|
-
>>> inters = Intervals([(2, 5), (7, 8)])
|
420
|
-
>>> inters.sub(s, lambda m: 'b')
|
421
|
-
'01b56b89'
|
422
|
-
>>> inters.sub(s, lambda m: 'b', out_repl=lambda m: 'a')
|
423
|
-
'ababa'
|
424
|
-
>>> inters.sub(s, 'b')
|
425
|
-
'01b56b89'
|
426
|
-
>>> inters.sub(s, 'b', out_repl='a')
|
427
|
-
'ababa'
|
428
|
-
>>> inters.sub(s, lambda m: ' ' + ''.join(reversed(m.group())) + ' ')
|
429
|
-
'01 432 56 7 89'
|
430
|
-
>>> inters.sub(s, lambda m: ' ' + ''.join(reversed(m.group())) + ' ', out_repl=lambda m: str(len(m.group())))
|
431
|
-
'2 432 2 7 2'
|
432
|
-
"""
|
433
|
-
res, idx = [], 0
|
434
|
-
|
435
|
-
def str2func(a):
|
436
|
-
# TODO,如果是str类型,应该要处理字符串标记中的编组和转义等信息的
|
437
|
-
return (lambda s: a) if isinstance(a, str) else a
|
438
|
-
|
439
|
-
repl, out_repl = str2func(repl), str2func(out_repl)
|
440
|
-
|
441
|
-
def func1(regs):
|
442
|
-
return repl(ReMatch(regs, s, 0, len(s))) # 构造伪match类并传入
|
443
|
-
|
444
|
-
def func2(start_, end_):
|
445
|
-
if out_repl:
|
446
|
-
return out_repl(ReMatch(((start_, end_),), s, 0, len(s)))
|
447
|
-
else:
|
448
|
-
return s[start_:end_]
|
449
|
-
|
450
|
-
for inter in self.merge_intersect_interval(adjacent=adjacent):
|
451
|
-
# 匹配范围外的文本处理
|
452
|
-
if inter.start() > idx:
|
453
|
-
res.append(func2(idx, inter.start()))
|
454
|
-
# 匹配范围内的处理
|
455
|
-
res.append(func1(inter.regs))
|
456
|
-
idx = inter.end()
|
457
|
-
if idx < len(s): res.append(func2(idx, len(s)))
|
458
|
-
return ''.join(res)
|
459
|
-
|
460
|
-
def replace(self, s, arg1, arg2=None, *, out_repl=lambda s: s, adjacent=False) -> str:
|
461
|
-
r"""类似sub函数,但是对两个自定义函数传入的是普通字符串类型,而不是match对象
|
462
|
-
|
463
|
-
:param arg1: 可以输入一个自定义函数
|
464
|
-
:param arg2: 可以配合arg1使用,功能同str.replace(arg1, arg2)
|
465
|
-
:param adjacent: 替换的时候,为了避免混乱出错,是先要合并重叠的区间集的
|
466
|
-
这里有个adjacent参数,True表示邻接的区间会合并,反之则不会合并临接区间
|
467
|
-
|
468
|
-
>>> s = '0123456789'
|
469
|
-
>>> inters = Intervals([(2, 5), (7, 8)])
|
470
|
-
>>> inters.replace(s, lambda s: 'b')
|
471
|
-
'01b56b89'
|
472
|
-
>>> inters.replace(s, lambda s: 'b', out_repl=lambda s: 'a')
|
473
|
-
'ababa'
|
474
|
-
>>> inters.replace(s, 'b')
|
475
|
-
'01b56b89'
|
476
|
-
>>> inters.replace(s, '2', 'b')
|
477
|
-
'01b3456789'
|
478
|
-
>>> inters.replace(s, lambda s: ' ' + ''.join(reversed(s)) + ' ', out_repl=lambda s: str(len(s)))
|
479
|
-
'2 432 2 7 2'
|
480
|
-
"""
|
481
|
-
res, idx = [], 0
|
482
|
-
|
483
|
-
def str2func(a):
|
484
|
-
return (lambda s: a) if isinstance(a, str) else a
|
485
|
-
|
486
|
-
repl, out_repl = str2func(arg1), str2func(out_repl)
|
487
|
-
if arg2:
|
488
|
-
repl = lambda a: a.replace(arg1, arg2)
|
489
|
-
|
490
|
-
for inter in self.merge_intersect_interval(adjacent=adjacent):
|
491
|
-
# 匹配范围外的文本处理
|
492
|
-
if inter.start() >= idx:
|
493
|
-
res.append(out_repl(s[idx:inter.start()]))
|
494
|
-
idx = inter.end()
|
495
|
-
# 匹配范围内的处理
|
496
|
-
res.append(repl(s[inter.start():inter.end()]))
|
497
|
-
if idx < len(s): res.append(out_repl(s[idx:]))
|
498
|
-
return ''.join(res)
|
499
|
-
|
500
|
-
def __bool__(self):
|
501
|
-
"""
|
502
|
-
>>> bool(Intervals())
|
503
|
-
False
|
504
|
-
>>> bool(Intervals([(1, 2), (4, 5)]))
|
505
|
-
True
|
506
|
-
>>> bool(Intervals([(2, 1), (5, 4)]))
|
507
|
-
False
|
508
|
-
"""
|
509
|
-
return bool(self.li)
|
510
|
-
|
511
|
-
def __getitem__(self, item):
|
512
|
-
return self.li[item]
|
513
|
-
|
514
|
-
def __iter__(self):
|
515
|
-
for m in self.li:
|
516
|
-
yield m
|
517
|
-
|
518
|
-
def __len__(self):
|
519
|
-
return len(self.li)
|
520
|
-
|
521
|
-
def __repr__(self):
|
522
|
-
return '{' + ', '.join([str(m) for m in self.li]) + '}'
|
523
|
-
|
524
|
-
def __eq__(self, other):
|
525
|
-
""""数量相等,且每个Interval也相等
|
526
|
-
|
527
|
-
即只考虑强相等,不考虑“弱相等”。
|
528
|
-
例如两个区间集虽然数量不同,但使用merge_intersect_interval后,再比较可能就是一样的。
|
529
|
-
|
530
|
-
>>> Intervals([(1,2), (3,5)]) == Intervals([(1,2), (3,5)])
|
531
|
-
True
|
532
|
-
>>> Intervals([(1,2), (3,5)]) == Intervals([(1,2), (3,4), (4,5)])
|
533
|
-
False
|
534
|
-
>>> Intervals([(1,2), (3,5)]) == Intervals([(1,2), (3,4), (4,5)]).merge_intersect_interval(True)
|
535
|
-
True
|
536
|
-
"""
|
537
|
-
if len(self) != len(other): return False
|
538
|
-
for i in range(len(self)):
|
539
|
-
if self[i] != other[i]:
|
540
|
-
return False
|
541
|
-
return True
|
542
|
-
|
543
|
-
def __invert__(self, maxn=None):
|
544
|
-
"""取反区间集的补集
|
545
|
-
注意这样会丢失所有区间的子区间标记
|
546
|
-
|
547
|
-
>>> ~Intervals([(1, 3), (4, 6), (8, 10)]) # 区间取反操作
|
548
|
-
{[0], [3], [6~7]}
|
549
|
-
>>> ~Intervals([]) # 区间取反操作
|
550
|
-
{}
|
551
|
-
"""
|
552
|
-
# 1 要先把有相交的区间合并了
|
553
|
-
itvs = self.merge_intersect_interval()
|
554
|
-
|
555
|
-
# 2 辅助变量
|
556
|
-
li = []
|
557
|
-
if maxn is None: maxn = itvs.end() # 计算出坐标上限
|
558
|
-
|
559
|
-
# 3 第1个区间是否从0开始
|
560
|
-
if len(itvs) and itvs[0].start() == 0:
|
561
|
-
idx = itvs[0].end()
|
562
|
-
i = 1
|
563
|
-
else:
|
564
|
-
i = idx = 0
|
565
|
-
|
566
|
-
# 4 循环取得新的区间值
|
567
|
-
for m in itvs[i:]:
|
568
|
-
li.append(Interval(idx, m.start()))
|
569
|
-
idx = m.end()
|
570
|
-
|
571
|
-
# 5 最后一个区间特判
|
572
|
-
if idx != maxn: li.append(Interval(idx, maxn))
|
573
|
-
res = Intervals(li)
|
574
|
-
return res
|
575
|
-
|
576
|
-
def invert(self, maxn=None):
|
577
|
-
"""
|
578
|
-
>>> Intervals([(1, 3), (4, 6), (8, 10)]).invert(20)
|
579
|
-
{[0], [3], [6~7], [10~19]}
|
580
|
-
"""
|
581
|
-
return self.__invert__(maxn)
|
582
|
-
|
583
|
-
def __and__(self, other):
|
584
|
-
r"""
|
585
|
-
# 区间集和单个区间的相交运算:
|
586
|
-
>>> Intervals([(2, 4), (9, 11)]) & Interval(0, 10)
|
587
|
-
{[2~3], [9]}
|
588
|
-
|
589
|
-
# 区间集和区间集的相交运算:
|
590
|
-
>>> Intervals([(1, 5), (6, 8)]) & Intervals([(2, 7), (7, 9)])
|
591
|
-
{[2~4], [6], [7]}
|
592
|
-
|
593
|
-
>>> Intervals([(2, 11)]) & Intervals()
|
594
|
-
{}
|
595
|
-
>>> Intervals() & Intervals([(2, 11)])
|
596
|
-
{}
|
597
|
-
"""
|
598
|
-
# 0 区间 转 区间集
|
599
|
-
if isinstance(other, Interval):
|
600
|
-
other = Intervals([other])
|
601
|
-
|
602
|
-
# 1 区间集和区间集做相交运算,生成一个新的区间集
|
603
|
-
"""假设a、b都是从左到右按顺序取得,所以可以对b的遍历进行一定过滤简化"""
|
604
|
-
A = self.merge_intersect_interval()
|
605
|
-
B = other.merge_intersect_interval()
|
606
|
-
li, k = [], 0
|
607
|
-
for a in A:
|
608
|
-
for j in range(k, len(B)):
|
609
|
-
b = B[j]
|
610
|
-
if b.end() <= a.start():
|
611
|
-
# B[0~j]都在a前面,在后续的a中,可以直接从B[j]开始找
|
612
|
-
k = j
|
613
|
-
elif b.start() >= a.end(): # b已经到a右边,后面的b不用再找了,不会有相交
|
614
|
-
break
|
615
|
-
else: # 可能有相交
|
616
|
-
li.append(a & b)
|
617
|
-
return Intervals(li)
|
618
|
-
|
619
|
-
def is_adjacent_and(self, other):
|
620
|
-
""" __and__运算的变形,两区间邻接时也认为相交
|
621
|
-
|
622
|
-
>>> Intervals([(2, 4), (9, 11)]).is_adjacent_and(Interval(0, 10))
|
623
|
-
True
|
624
|
-
>>> Intervals([(1, 5), (6, 8)]).is_adjacent_and(Intervals([(2, 7), (7, 9)]))
|
625
|
-
True
|
626
|
-
>>> Intervals([(2, 11)]).is_adjacent_and(Intervals())
|
627
|
-
False
|
628
|
-
>>> Intervals().is_adjacent_and(Intervals([(2, 11)]))
|
629
|
-
False
|
630
|
-
>>> Intervals([(2, 11)]).is_adjacent_and(Interval(11, 13))
|
631
|
-
True
|
632
|
-
"""
|
633
|
-
# 0 区间 转 区间集
|
634
|
-
if isinstance(other, Interval):
|
635
|
-
other = Intervals([other])
|
636
|
-
|
637
|
-
# 1 区间集和区间集做相交运算,生成一个新的区间集
|
638
|
-
"""假设a、b都是从左到右按顺序取得,所以可以对b的遍历进行一定过滤简化"""
|
639
|
-
A = self.merge_intersect_interval()
|
640
|
-
B = other.merge_intersect_interval()
|
641
|
-
li, k = [], 0
|
642
|
-
for a in A:
|
643
|
-
for j in range(k, len(B)):
|
644
|
-
b = B[j]
|
645
|
-
if b.end() < a.start():
|
646
|
-
# B[0~j]都在a前面,在后续的a中,可以直接从B[j]开始找
|
647
|
-
k = j
|
648
|
-
elif b.start() > a.end(): # b已经到a右边,后面的b不用再找了,不会有相交
|
649
|
-
break
|
650
|
-
else: # 可能有相交
|
651
|
-
return True
|
652
|
-
return False
|
653
|
-
|
654
|
-
def __contains__(self, other):
|
655
|
-
r"""
|
656
|
-
>>> Interval(3, 5) in Intervals([(2, 6)])
|
657
|
-
True
|
658
|
-
>>> Interval(3, 5) in Intervals([(0, 4)])
|
659
|
-
False
|
660
|
-
>>> Intervals([(1, 2), (3, 4)]) in Intervals([(0, 3), (3, 5)])
|
661
|
-
True
|
662
|
-
>>> Interval(3, 5) not in Intervals([(2, 6)])
|
663
|
-
False
|
664
|
-
|
665
|
-
这里具体实现,可以双循环暴力,但考虑区间集顺序性,其实只要双指针同时往前找就好了,
|
666
|
-
设几个条件去对循环进行优化,跳出,能大大提高效率
|
667
|
-
"""
|
668
|
-
# 1 区间集 是否包含 区间,转为 区间集 是否包含 区间集 处理
|
669
|
-
if isinstance(other, (Interval, list, tuple)):
|
670
|
-
other = Intervals([other])
|
671
|
-
|
672
|
-
# 2 合并相交区域
|
673
|
-
A = self.merge_intersect_interval()
|
674
|
-
B = other.merge_intersect_interval()
|
675
|
-
|
676
|
-
# 3 看是否每个b,都能在A中找到一个a包含它
|
677
|
-
i = 0
|
678
|
-
for b in B:
|
679
|
-
for j in range(i, len(A)):
|
680
|
-
if b in A[j]:
|
681
|
-
# A[j-1]前面都不能包含b,但是A[j]能包含b的,后面的b,A[j-1]也一定包含不到
|
682
|
-
i = j
|
683
|
-
break
|
684
|
-
elif A[j].start() > b.end():
|
685
|
-
# 后续的a左边都已经比b的右边更大,就不用再找了,肯定都不会有相交的了
|
686
|
-
return False
|
687
|
-
else: # 找到一个b,在A中不包含它
|
688
|
-
return False
|
689
|
-
return True
|
690
|
-
|
691
|
-
def __or__(self, other):
|
692
|
-
r"""区间集相加运算,合成一个新的区间集对象(会丢失所有子区间)
|
693
|
-
|
694
|
-
出现相交的元素会合成一个新的元素,避免区间集中存在相交的两个元素,在sub时会出bug
|
695
|
-
>>> Intervals([(2, 4), (5, 7)]) | Interval(1, 3)
|
696
|
-
{[1~3: 1~2 2~3], [5~6]}
|
697
|
-
>>> Intervals([(2, 4), (5, 7)]) | Intervals([(1, 3), (6, 9)])
|
698
|
-
{[1~3: 1~2 2~3], [5~8: 5~6 6~8]}
|
699
|
-
|
700
|
-
>>> Intervals([(1, 3), (6, 9)]) + 3
|
701
|
-
{[4~5], [9~11]}
|
702
|
-
"""
|
703
|
-
if isinstance(other, Interval):
|
704
|
-
other = Intervals([other])
|
705
|
-
else:
|
706
|
-
other = Intervals(other)
|
707
|
-
return Intervals(self.li + other.li).merge_intersect_interval()
|
708
|
-
|
709
|
-
def __add__(self, other):
|
710
|
-
if isinstance(other, int):
|
711
|
-
li = [x + other for x in self.li]
|
712
|
-
return Intervals(li)
|
713
|
-
else:
|
714
|
-
return self | other
|
715
|
-
|
716
|
-
def __sub__(self, other):
|
717
|
-
"""区间集减法操作(注意跟Interval减法操作是有区别的)
|
718
|
-
对于任意的 a ∈ self,更新 a = a - {b | b ∈ other}
|
719
|
-
|
720
|
-
>>> Intervals([(0, 10)]) - Interval(4, 6)
|
721
|
-
{[0~3], [6~9]}
|
722
|
-
>>> Intervals([(0, 10)]) - Interval(8, 12)
|
723
|
-
{[0~7]}
|
724
|
-
|
725
|
-
>>> Intervals([(0, 10), (20, 30)]) - Intervals([(0, 5), (15, 25)])
|
726
|
-
{[5~9], [25~29]}
|
727
|
-
>>> Intervals([(0, 10), (20, 30)]) - Intervals([(2, 5), (7, 12), (25, 27)])
|
728
|
-
{[0~1], [5~6], [20~24], [27~29]}
|
729
|
-
"""
|
730
|
-
# 1
|
731
|
-
if isinstance(other, Interval):
|
732
|
-
other = Intervals([other])
|
733
|
-
|
734
|
-
# a - b,一个a可能会拆成a1,a2两段,此时左边的a1可以继续处理,
|
735
|
-
# 但是a2要加到堆栈A,留作下一轮处理,所以A要用栈结构处理
|
736
|
-
A = list(reversed(self.merge_intersect_interval().li))
|
737
|
-
|
738
|
-
B = other.merge_intersect_interval()
|
739
|
-
|
740
|
-
# 2
|
741
|
-
li, k = [], 0
|
742
|
-
while A:
|
743
|
-
a = A.pop()
|
744
|
-
for j in range(k, len(B)):
|
745
|
-
b = B[j]
|
746
|
-
if b.end() < a.start():
|
747
|
-
k = j
|
748
|
-
elif a.end() < a.start():
|
749
|
-
break
|
750
|
-
else:
|
751
|
-
c = a - b
|
752
|
-
if not c: # a已经被减光,就直接跳出循环了
|
753
|
-
a = Interval()
|
754
|
-
break
|
755
|
-
elif len(c.regs) == 1:
|
756
|
-
a = c
|
757
|
-
else: # 如果 a - c 变成了两段,则左边a1继续处理,右边a2加入A下轮处理
|
758
|
-
a = Interval(c.regs[1])
|
759
|
-
A.append(Interval(c.regs[2]))
|
760
|
-
if a: li.append(a)
|
761
|
-
|
762
|
-
return Intervals(li).merge_intersect_interval()
|
763
|
-
|
764
|
-
|
765
|
-
def iter_intervals(arg):
|
766
|
-
"""从多种类区间类型来构造Interval对象,返回值可能有多组"""
|
767
|
-
|
768
|
-
def judge_range(t):
|
769
|
-
return hasattr(t, '__len__') and len(t) == 2 and isinstance(t[0], int) and isinstance(t[1], int)
|
770
|
-
|
771
|
-
if hasattr(arg, 'regs'):
|
772
|
-
yield Interval(arg)
|
773
|
-
elif judge_range(arg):
|
774
|
-
yield Interval(arg)
|
775
|
-
elif isinstance(arg, Interval):
|
776
|
-
yield arg
|
777
|
-
elif isinstance(arg, Intervals):
|
778
|
-
for i in range(len(arg)):
|
779
|
-
yield arg[i]
|
780
|
-
elif hasattr(arg, '__len__') and len(arg) and judge_range(arg[0]):
|
781
|
-
for i in range(len(arg)):
|
782
|
-
yield Interval(arg[i])
|
783
|
-
elif isinstance(arg, collections.Iterable):
|
784
|
-
for t in list(arg):
|
785
|
-
yield t
|
786
|
-
|
787
|
-
|
788
|
-
def highlight_intervals(content, intervals, colors=None, background=True,
                        use_mathjax=False,
                        only_body=False,
                        title='highlight_intervals',
                        set_pre='<pre class="prettyprint nocode linenums" style="white-space: pre-wrap;">'):
    """Visualise text matches: return HTML in which the given intervals are highlighted.

    :param content: the text to display
    :param intervals: a sequence; each element is a single interval or a group of
        intervals in any form accepted by ``iter_intervals``, e.g.
        ``Interval``, an ``re`` match object, ``(4, 10)``,
        ``Intervals``, ``[(2, 4), (6, 8)]``.

        The caller must make sure the intervals nest properly; badly overlapping
        intervals are not detected and produce badly nested tags.
    :param colors: a colour name or a sequence of colour names, used round-robin,
        one colour per element of ``intervals``. Defaults to ``('red',)``.
    :param background:
        True: highlight with a background colour.
        False: highlight by changing the font colour.
    :param use_mathjax: forwarded to the page template (True asks it to render formulas).
    :param only_body: return only the ``<pre>...</pre>`` fragment instead of a full page.
    :param title: page title forwarded to the template.
    :param set_pre: opening ``<pre>`` tag. The default wraps long lines; use
        ``'<pre class="prettyprint nocode linenums">'`` for no wrapping.
    :return: the HTML as a string
    """
    from collections import defaultdict
    import html

    # 1 Tags to insert, keyed by the offset in ``content`` where they go.
    marks = defaultdict(str)

    # 2 One colour per element of ``intervals``, cycling through ``colors``.
    if colors is None:
        colors = ('red',)
    elif isinstance(colors, str):
        colors = (colors,)
    n = len(colors)
    for i, arg in enumerate(intervals):
        color = colors[i % n]
        for interval in iter_intervals(arg):
            l, r = interval.start(), interval.end()
            if l >= r:
                # Empty interval (its bounds may even be +/-inf): nothing to
                # highlight, and emitting tags would close before opening.
                continue
            # Opening tags are appended and closing tags are prepended so that
            # tags sharing an offset close before new ones open.
            if background:
                marks[l] = marks[l] + f'<span style="background-color: {color}">'
                marks[r] = '</span>' + marks[r]
            else:
                marks[l] = marks[l] + f'<font color={color}>'
                marks[r] = '</font>' + marks[r]

    # 3 Stitch the page body: escaped text segments interleaved with the tags.
    #   Every text segment is escaped, including the head and the tail.
    res = [set_pre]
    prev = 0
    for idx in sorted(marks):
        res.append(html.escape(content[prev:idx]))
        res.append(marks[idx])
        prev = idx
    res.append(html.escape(content[prev:]))
    res.append('</pre>')
    body = ''.join(res)

    if only_body:
        return body

    # Imported lazily so that ``only_body=True`` works without the template machinery.
    from pyxllib.text.xmllib import get_jinja_template
    return get_jinja_template('highlight_code.html').render(title=title, body=body, use_mathjax=use_mathjax)
|
861
|
-
|
862
|
-
|
863
|
-
class StrIdxBack:
    r"""Delete noise characters from a string, match on the cleaned string, and
    map the match positions back to the original string.

    >>> ob = StrIdxBack('bxx  ax xbxax')
    >>> ob.delchars(r'[ x]+')
    >>> ob  # spaces and the letter x removed
    baba
    >>> print(ob.idx)  # original offset of every kept character
    (0, 5, 9, 11)
    >>> m = re.match(r'b(ab)', ob.keystr)
    >>> m = ob.matchback(m)
    >>> m.group(1)
    'ax xb'
    >>> ob.search('ab')  # text of the original string covered by the match
    'ax xb'
    """

    def __init__(self, s):
        self.oristr = s
        self.idx = tuple(range(len(s)))  # original offsets of the characters still kept
        self.keystr = s

    def delchars(self, pattern, flags=0):
        r"""Delete everything matching ``pattern`` from ``keystr``.

        Works like ``re.sub(pattern, '', keystr)`` while also keeping ``idx`` in
        sync, so each remaining character still knows its original offset.

        >>> ob = StrIdxBack('abc123df4a'); ob.delchars(r'\d+'); str(ob)
        'abcdfa'
        >>> ob.idx
        (0, 1, 2, 6, 7, 9)
        """
        k = 0
        kept = []

        def repl(m):
            nonlocal k
            # Keep the offsets of the text between the previous match and this one.
            kept.append(self.idx[k:m.start(0)])
            k = m.end(0)
            return ''

        self.keystr = re.sub(pattern, repl, self.keystr, flags=flags)
        kept.append(self.idx[k:])
        self.idx = tuple(itertools.chain.from_iterable(kept))

    def compare_newstr(self, limit=300):
        r"""Return the cleaned string laid out under the original one.

        Kept characters stay in place. A deleted character becomes a blank,
        except that a deleted space becomes ``_`` so it stays visible. A deleted
        non-ASCII character becomes two blanks so wide characters keep the two
        lines aligned. Only the first ``limit`` characters are rendered.

        >>> ob = StrIdxBack('abab'); ob.delchars('b'); ob.compare_newstr()
        'a a '
        """
        s1 = self.oristr
        kept = set(self.idx)

        out = []
        for i in range(min(len(s1), limit)):
            ch = s1[i]
            if i in kept:
                out.append(ch)
            elif ord(ch) >= 128:
                # Wide (e.g. CJK) character: two blanks are needed for alignment.
                out.append('  ')
            elif ch == ' ':
                out.append('_')
            else:
                out.append(' ')
        return ''.join(out).replace('\n', r'\n')

    def compare(self, limit=300):
        """Return the original and the cleaned string on two aligned lines."""
        s1 = self.oristr.replace('\n', r'\n')[:limit]
        s2 = self.compare_newstr(limit)
        return s1 + '\n' + s2 + '\n'

    def matchback(self, m):
        """Map a match object obtained on ``keystr`` back onto ``oristr``.

        :param m: match-like object with ``regs``, ``pos``, ``lastindex``,
            ``lastgroup`` and ``re`` attributes
        :return: a ``ReMatch`` whose spans refer to ``oristr``
        """
        n = len(self.idx)
        regs = []
        for left, right in getattr(m, 'regs'):
            if left < 0 or right < 0:
                # Group did not take part in the match: keep the (-1, -1) marker.
                regs.append((-1, -1))
            elif left == right:
                # Empty span: anchor it at the mapped position (or at the end).
                pos = self.idx[left] if left < n else len(self.oristr)
                regs.append((pos, pos))
            else:
                # The right bound is exclusive, so map the last covered character
                # and step one past it.
                regs.append((self.idx[left], self.idx[right - 1] + 1))
        return ReMatch(regs, self.oristr, m.pos, len(self.oristr), m.lastindex, m.lastgroup, m.re)

    def search(self, pattern):
        """Search ``pattern`` in ``keystr`` and return the matching slice of ``oristr``.

        Returns ``''`` when nothing (or only an empty span) matches.
        """
        m = re.search(pattern, self.keystr)
        if not m:
            return ''
        m = self.matchback(m)
        return m.group() if m else ''

    def __repr__(self):
        """Return the current cleaned string."""
        return self.keystr
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2019/12/04 11:16
|
6
|
+
|
7
|
+
|
8
|
+
""" 区间类
|
9
|
+
|
10
|
+
关于区间类,可以参考: https://github.com/AlexandreDecan/python-intervals
|
11
|
+
但其跟我的业务场景有区别,不太适用,所以这里还是开发了自己的功能库
|
12
|
+
|
13
|
+
文档: https://histudy.yuque.com/docs/share/365f3a75-28d0-4595-bc80-5e9d6ab36f71#
|
14
|
+
"""
|
15
|
+
|
16
|
+
import collections
|
17
|
+
import itertools
|
18
|
+
import math
|
19
|
+
import re
|
20
|
+
|
21
|
+
|
22
|
+
class Interval:
    """A half-open integer interval ``[start, end)`` compatible with ``re`` match objects.

    To stay compatible with match objects the interval is stored in the same
    ``regs`` layout: ``regs[0]`` is the main interval (like ``group(0)``) and any
    following entries are sub-intervals (like ``group(1)``, ``group(2)``, ...).
    An empty interval is always stored as ``regs = None``.
    """
    __slots__ = ('regs',)

    def __init__(self, arg1=None, arg2=None):
        if isinstance(arg1, int) and isinstance(arg2, int):
            # The usual form: Interval(start, end)
            self.regs = ((arg1, arg2),)
        elif getattr(arg1, 'regs', None):
            # Anything carrying ``regs`` (typically an re match object)
            self.regs = arg1.regs
        elif arg2 is None and arg1 and len(arg1) == 2 and isinstance(arg1[0], int):
            # A single (start, end) pair
            self.regs = (tuple(arg1),)
        elif arg1:
            # A full regs sequence: main interval followed by sub-intervals
            self.regs = tuple(arg1)
        else:
            # Empty interval
            self.regs = None
        self.update()

    def update(self):
        """Normalise every empty interval to ``regs = None``.

        >>> Interval(5, 5).regs  # returns None
        >>> Interval(6, 5).regs
        """
        if self.regs and self.regs[0][0] >= self.regs[0][1]:
            self.regs = None

    def start(self, idx=0):
        """Left bound of ``regs[idx]``; ``math.inf`` for an empty interval."""
        return self.regs[idx][0] if self.regs else math.inf

    def end(self, idx=0):
        """Right (exclusive) bound of ``regs[idx]``; ``-math.inf`` for an empty interval."""
        return self.regs[idx][1] if self.regs else -math.inf

    def __bool__(self):
        """An interval is truthy only when it has positive length.

        >>> bool(Interval(5, 6))
        True
        >>> bool(Interval())
        False
        >>> bool(Interval(5, 5))  # half-open, so this is empty
        False
        >>> bool(Interval(6, 5))
        False
        """
        return bool(self.regs) and self.end() > self.start()

    def __repr__(self):
        """Show the interval with inclusive bounds; sub-intervals follow the colon.

        >>> Interval()  # empty
        []
        >>> Interval(6, 5)  # empty
        []
        >>> Interval(4, 8)  # stored half-open, displayed closed
        [4~7]
        >>> Interval(5, 6)  # a single value is shown as just that value
        [5]
        >>> Interval(((4, 8), (4, 6), (7, 8)))
        [4~7: 4~5 7]
        """
        li = []
        if self.regs:
            li = [(f'{a}~{b - 1}' if b - a > 1 else str(a)) for a, b in self.regs]
            if len(li) > 1:
                li[0] += ':'
        return '[' + ' '.join(li) + ']'

    def __eq__(self, other):
        """Two intervals are equal when their main intervals match (sub-intervals are ignored).

        >>> Interval(5, 7) == Interval([(5, 7), (4, 6)])
        True
        >>> Interval(5, 5) == Interval()
        True
        >>> Interval(5, 6) == Interval(5, 7)
        False
        """
        if not hasattr(other, 'regs'):
            # Let Python fall back to its default handling for foreign types.
            return NotImplemented
        a = self.regs and self.regs[0]
        b = other.regs and other.regs[0]
        return a == b

    def __lt__(self, other):
        """Order by the main interval: by start first, then by end.

        >>> Interval(4, 6) < Interval(7, 8)
        True
        >>> Interval(4, 5) < Interval(4, 6)
        True
        >>> Interval(4, 6) < Interval(5, 6)
        True
        >>> Interval(4, 6) > Interval(7, 8)  # '>' is served by the reflected '<'
        False
        """
        if self.start() == other.start():
            return self.end() < other.end()
        return self.start() < other.start()

    def __and__(self, other):
        """Intersection of the main intervals.

        >>> Interval(4, 7) & Interval(5, 8)
        [5~6]
        >>> Interval(4, 6) & Interval(5, 8)
        [5]
        >>> Interval(4, 6) & Interval(6, 8)
        []
        >>> Interval(4, 6) & Interval()
        []

        An ``Intervals`` operand is treated as one interval spanning from its
        start to its end (this differs from ``Intervals & Interval``):

        >>> Interval(0, 10) & Intervals([(2, 4), [9, 11]])
        [2~9]
        """
        lo, hi = max(self.start(), other.start()), min(self.end(), other.end())
        if lo >= hi:
            # Also covers an empty operand, whose bounds are +/-inf and must not
            # be handed to the constructor.
            return Interval()
        return Interval(lo, hi)

    def __contains__(self, other):
        """Whether this main interval fully covers ``other``'s main interval.

        :param other: an ``Interval`` or ``Intervals``

        >>> Interval(6, 8) in Interval(4, 10)
        True
        >>> Interval(2, 7) in Interval(4, 10)
        False
        >>> Intervals([(1,3), (2,4)]) in Interval(1, 4)
        True
        >>> Interval(2, 7) not in Interval(4, 10)
        True
        """
        return self.start() <= other.start() and self.end() >= other.end()

    def __or__(self, other):
        """Smallest interval covering both operands, keeping both as sub-intervals.

        :param other: an ``Interval`` or ``Intervals``
        :return: an ``Interval`` whose main interval spans both operands and whose
            two sub-intervals are the operands' main intervals in sorted order
            (the operands' own sub-intervals are dropped). If either operand is
            empty, the result is just the other operand's main interval.

        >>> m1, m2, m3 = Interval(4, 6), Interval(5, 8), Interval(10, 15)
        >>> m1 | m2
        [4~7: 4~5 5~7]
        >>> m1 | m3
        [4~14: 4~5 10~14]
        >>> m1 | Intervals([(2, 4), (7, 9)])
        [2~8: 2~8 4~5]
        >>> Interval() | m1
        [4~5]
        """
        if not self or not other:
            keep = self if self else other
            return Interval(keep.start(), keep.end()) if keep else Interval()
        left, right = min(self.start(), other.start()), max(self.end(), other.end())
        a, b = sorted([tuple(self.regs[0]), (other.start(), other.end())])  # smaller one first
        return Interval(((left, right), a, b))

    def __add__(self, other):
        """Shift by an integer offset, or behave like ``|`` for interval operands.

        >>> Interval(10, 15) + 6
        [16~20]
        """
        if isinstance(other, int):
            if not self.regs:
                return Interval()
            return Interval([(t[0] + other, t[1] + other) for t in self.regs])
        return self | other

    def __sub__(self, other):
        """Main interval of ``self`` minus the main interval of ``other``.

        :param other: an ``Interval`` or ``Intervals``; an ``Intervals`` is treated
            as the single interval from its start to its end
        :return: the remaining interval (sub-intervals are dropped). When
            ``other`` cuts ``self`` in two, the result keeps ``self``'s main
            interval and carries the two remaining pieces as sub-intervals.

        >>> Interval(4, 6) - Interval(5, 8)
        [4]
        >>> Interval(0, 10) - Interval(4, 6)
        [0~9: 0~3 6~9]
        >>> Interval(4, 7) - Intervals([(2, 3), (7, 8)])
        []
        """
        if not self:
            return Interval()
        if isinstance(other, Intervals):
            other = Interval(other.start(), other.end()) if other else Interval()
        a1, a2 = self.start(), self.end()
        b1, b2 = other.start(), other.end()
        if a1 >= b2 or a2 <= b1:
            # Disjoint. Return a fresh single-range interval rather than ``self`` so
            # callers can rely on ``len(result.regs)`` to detect a split.
            return Interval(a1, a2)
        c1, c2 = Interval(a1, b1), Interval(b2, a2)
        if c1 and c2:
            return Interval(((a1, a2), (c1.start(), c1.end()), (c2.start(), c2.end())))
        if c1:
            return c1
        if c2:
            return c2
        return Interval()
|
224
|
+
|
225
|
+
|
226
|
+
class ReMatch(Interval):
    """A writable stand-in for the standard library's ``re`` match object.

    1. The real match type cannot be instantiated or modified from Python, so
       this class mimics the part of its interface that is needed here
       (custom spans in particular).
    2. Being an ``Interval``, it also takes part in all interval arithmetic and
       works with the ``Intervals`` collection.
    """
    # ``regs`` is already a slot of ``Interval``; redeclaring it would shadow the
    # base slot and waste space.
    __slots__ = ('string', 'pos', 'endpos', 'lastindex', 'lastgroup', 're')

    def __init__(self, regs=None, string=None, pos=0, endpos=None, lastindex=None, lastgroup=None, re=None):
        """Create a new match object.

        :param regs: the spans, or a match-like object to copy every field from
        :param string: the complete original string
        :param pos: start of the searched range, usually 0
        :param endpos: end of the searched range, usually ``len(string)``
        :param lastindex: int, index of the last group; derived from ``regs``
            when not given
        :param lastgroup: name of the last matched capturing group, or None if the
            group had no name or no group was matched at all
        :param re: the pattern object that produced the match
        """
        if hasattr(regs, 'regs'):
            # Initialise from a match-like object. Using hasattr (not the value of
            # ``regs.regs``) also covers an empty interval, whose regs is None.
            m = regs
            self.pos = getattr(m, 'pos', None)
            self.endpos = getattr(m, 'endpos', None)
            self.lastindex = getattr(m, 'lastindex', None)
            self.lastgroup = getattr(m, 'lastgroup', None)
            self.re = getattr(m, 're', None)
            self.string = getattr(m, 'string', None)
            self.regs = getattr(m, 'regs', None)
        else:
            self.regs = regs
            self.string = string
            self.pos = pos
            self.endpos = endpos
            # ``regs`` may be None (all defaults), so guard before taking len().
            if not lastindex and regs and len(regs) > 1:
                lastindex = len(regs) - 1
            self.lastindex = lastindex
            self.lastgroup = lastgroup
            self.re = re
        self.update()

    def group(self, idx=0):
        """Return the text of group ``idx`` (0 is the whole match)."""
        left, right = self.regs[idx][0], self.regs[idx][1]
        return self.string[left:right]

    def expand(self, template):
        """Return the string obtained by doing backslash substitution on the
        template string template.

        Not implemented.

        :type template: T
        :rtype: T
        """
        raise NotImplementedError

    def groups(self, default=None):
        """Return a tuple containing all the subgroups of the match, from 1 up
        to however many groups are in the pattern.

        ``default`` is accepted for signature compatibility but is not used.

        :rtype: tuple
        """
        return tuple(self.string[x[0]:x[1]] for x in self.regs[1:])

    def groupdict(self, default=None):
        """Return a dictionary containing all the named subgroups of the match,
        keyed by the subgroup name.

        Not implemented.

        :rtype: dict[bytes | unicode, T]
        """
        raise NotImplementedError

    def span(self, group=0):
        """Return a 2-tuple (start, end) for the substring matched by group.

        :type group: int
        :rtype: (int, int)
        """
        return self.regs[group][0], self.regs[group][1]
|
306
|
+
|
307
|
+
|
308
|
+
class Intervals:
    """A sorted collection of non-empty ``Interval`` objects with set-like operators."""

    def __init__(self, li=None):
        """
        :param li: an iterable of ``Interval`` objects (or anything the ``Interval``
            constructor accepts), another ``Intervals``, or an object exposing an
            ``intervals`` attribute
        """
        # 1 Accept several input types
        if hasattr(li, 'intervals'):
            li = li.intervals
        if isinstance(li, Intervals):
            # NOTE: state is shared with the source object, not copied.
            self.__dict__ = li.__dict__
        else:
            self.li = []
            if li is None:
                li = []
            for m in li:
                if not isinstance(m, Interval):
                    m = Interval(m)
                if m:  # only non-empty intervals are stored
                    self.li.append(m)
            self.li.sort()
        # 2 Cached bounds. The list is sorted, so the first item has the smallest start.
        self._start = self.li[0].start() if self.li else math.inf
        self._end = max([m.end() for m in self.li], default=-math.inf)

    def start(self):
        """Smallest start (``math.inf`` when empty); a method to mirror ``Interval.start``."""
        return self._start

    def end(self):
        """Largest end (``-math.inf`` when empty); a method to mirror ``Interval.end``."""
        return self._end

    def _merge_intersect_interval(self, adjacent=True):
        """Return a list in which overlapping (and optionally touching) intervals are merged."""
        if not self:
            return []
        li = [self[0]]
        for m in self[1:]:
            if li[-1].end() > m.start() or (adjacent and li[-1].end() == m.start()):
                li[-1] = m | li[-1]  # overlaps the previous one: merge into it
            else:
                li.append(m)  # otherwise start a new interval
        return li

    def merge_intersect_interval(self, adjacent=False):
        """Merge intervals that overlap.

        :param adjacent: also merge intervals that merely touch

        >>> Intervals([(1, 3), (2, 4), (5, 6)]).merge_intersect_interval(True)
        {[1~3: 1~2 2~3], [5]}
        >>> Intervals([(1, 2), (2, 3)]).merge_intersect_interval(True)
        {[1~2: 1 2]}
        >>> Intervals([(1, 2), (2, 3)]).merge_intersect_interval(adjacent=False)
        {[1], [2]}
        """
        # Called often, so each variant is computed once and cached on the instance.
        attr = '_li1' if adjacent else '_li2'
        cached = getattr(self, attr, None)
        if cached is None:
            cached = Intervals(self._merge_intersect_interval(adjacent))
            setattr(self, attr, cached)
        return cached

    def true_intersect_subinterval(self, other):
        """Return the parts where an interval of ``self`` and one of ``other``
        properly overlap, i.e. intersect without one containing the other.

        >>> a, b = Intervals([(1,3), (7,9), (10,12)]), Intervals([(2,4), (7,9), (10, 15)])
        >>> a.true_intersect_subinterval(b)
        {[2]}
        """
        # 0 Interval -> Intervals
        if isinstance(other, Interval):
            other = Intervals([other])

        # 1 Both sides are sorted, so the scan over B can be pruned.
        A = self.merge_intersect_interval()
        B = other.merge_intersect_interval()
        li, k = [], 0
        for a in A:
            for j in range(k, len(B)):
                b = B[j]
                x1, y1, x2, y2 = a.start(), a.end(), b.start(), b.end()
                if y2 <= x1:
                    # b lies left of a, hence left of every later a: resume from here
                    k = j
                elif x2 >= y1:  # b is already right of a: no later b can intersect
                    break
                elif (x2 < x1 < y2 < y1) or (x1 < x2 < y1 < y2):  # strict overlap, no containment
                    li.append(a & b)
        return Intervals(li)

    def sub(self, s, repl, *, out_repl=None, adjacent=False) -> str:
        r"""Replace the text covered by the intervals, similar to ``re.sub``.

        :param s: the text to process
        :param repl: a function taking a ``ReMatch`` and returning the replacement,
            or a plain string used literally (group references are not expanded)
        :param out_repl: optional function (or literal string) applied in the same
            way to every stretch of text outside the intervals
        :param adjacent: also merge touching intervals before replacing
        :return: the resulting string

        >>> s = '0123456789'
        >>> inters = Intervals([(2, 5), (7, 8)])
        >>> inters.sub(s, lambda m: 'b')
        '01b56b89'
        >>> inters.sub(s, lambda m: 'b', out_repl=lambda m: 'a')
        'ababa'
        >>> inters.sub(s, 'b')
        '01b56b89'
        >>> inters.sub(s, 'b', out_repl='a')
        'ababa'
        >>> inters.sub(s, lambda m: ' ' + ''.join(reversed(m.group())) + ' ')
        '01 432 56 7 89'
        >>> inters.sub(s, lambda m: ' ' + ''.join(reversed(m.group())) + ' ', out_repl=lambda m: str(len(m.group())))
        '2 432 2 7 2'
        """

        def str2func(a):
            # TODO: a str replacement should honour group references and escapes
            return (lambda _m: a) if isinstance(a, str) else a

        repl, out_repl = str2func(repl), str2func(out_repl)

        def inside(regs):
            return repl(ReMatch(regs, s, 0, len(s)))  # hand a pseudo match object to repl

        def outside(start_, end_):
            if out_repl:
                return out_repl(ReMatch(((start_, end_),), s, 0, len(s)))
            return s[start_:end_]

        res, idx = [], 0
        for inter in self.merge_intersect_interval(adjacent=adjacent):
            if inter.start() > idx:  # text before this interval
                res.append(outside(idx, inter.start()))
            res.append(inside(inter.regs))
            idx = inter.end()
        if idx < len(s):
            res.append(outside(idx, len(s)))
        return ''.join(res)

    def replace(self, s, arg1, arg2=None, *, out_repl=lambda s: s, adjacent=False) -> str:
        r"""Like ``sub``, but the callbacks receive plain strings instead of match objects.

        :param s: the text to process
        :param arg1: a function mapping the covered text to its replacement, or a
            literal replacement string
        :param arg2: when given, each covered piece becomes
            ``piece.replace(arg1, arg2)``; an empty string deletes ``arg1``
        :param out_repl: function (or literal string) applied to every non-empty
            stretch of text outside the intervals; ``None`` leaves it unchanged
        :param adjacent: overlapping intervals are always merged first; True also
            merges intervals that merely touch

        >>> s = '0123456789'
        >>> inters = Intervals([(2, 5), (7, 8)])
        >>> inters.replace(s, lambda s: 'b')
        '01b56b89'
        >>> inters.replace(s, lambda s: 'b', out_repl=lambda s: 'a')
        'ababa'
        >>> inters.replace(s, 'b')
        '01b56b89'
        >>> inters.replace(s, '2', 'b')
        '01b3456789'
        >>> inters.replace(s, '3', '')
        '012456789'
        >>> inters.replace(s, lambda s: ' ' + ''.join(reversed(s)) + ' ', out_repl=lambda s: str(len(s)))
        '2 432 2 7 2'
        >>> Intervals([(0, 2)]).replace('0123', 'b', out_repl='a')
        'ba'
        """

        def str2func(a):
            return (lambda _s: a) if isinstance(a, str) else a

        repl, out_repl = str2func(arg1), str2func(out_repl)
        if out_repl is None:
            out_repl = lambda t: t
        if arg2 is not None:  # '' is a valid replacement (deletion)
            repl = lambda a: a.replace(arg1, arg2)

        res, idx = [], 0
        for inter in self.merge_intersect_interval(adjacent=adjacent):
            # Text outside the intervals; empty gaps are skipped, as in ``sub``.
            if inter.start() > idx:
                res.append(out_repl(s[idx:inter.start()]))
            # Text inside the interval
            res.append(repl(s[inter.start():inter.end()]))
            idx = inter.end()
        if idx < len(s):
            res.append(out_repl(s[idx:]))
        return ''.join(res)

    def __bool__(self):
        """
        >>> bool(Intervals())
        False
        >>> bool(Intervals([(1, 2), (4, 5)]))
        True
        >>> bool(Intervals([(2, 1), (5, 4)]))
        False
        """
        return bool(self.li)

    def __getitem__(self, item):
        return self.li[item]

    def __iter__(self):
        yield from self.li

    def __len__(self):
        return len(self.li)

    def __repr__(self):
        return '{' + ', '.join([str(m) for m in self.li]) + '}'

    def __eq__(self, other):
        """Strict equality: same number of intervals and pairwise equal intervals.

        Collections that only become equal after ``merge_intersect_interval`` are
        not considered equal.

        >>> Intervals([(1,2), (3,5)]) == Intervals([(1,2), (3,5)])
        True
        >>> Intervals([(1,2), (3,5)]) == Intervals([(1,2), (3,4), (4,5)])
        False
        >>> Intervals([(1,2), (3,5)]) == Intervals([(1,2), (3,4), (4,5)]).merge_intersect_interval(True)
        True
        """
        if len(self) != len(other):
            return False
        return all(self[i] == other[i] for i in range(len(self)))

    def __invert__(self, maxn=None):
        """Complement of the collection within ``[0, maxn)``.

        All sub-interval information is lost. ``maxn`` defaults to the largest end.

        >>> ~Intervals([(1, 3), (4, 6), (8, 10)])
        {[0], [3], [6~7]}
        >>> ~Intervals([])
        {}
        """
        # 1 Merge overlapping intervals first
        itvs = self.merge_intersect_interval()

        # 2 Upper coordinate bound
        li = []
        if maxn is None:
            maxn = itvs.end()

        # 3 Does the first interval start at 0?
        if len(itvs) and itvs[0].start() == 0:
            idx = itvs[0].end()
            i = 1
        else:
            i = idx = 0

        # 4 Collect the gaps
        for m in itvs[i:]:
            li.append(Interval(idx, m.start()))
            idx = m.end()

        # 5 Trailing gap; nothing to add when the bound is not beyond ``idx``
        #   (this includes the -inf bound of an empty collection).
        if idx < maxn:
            li.append(Interval(idx, maxn))
        return Intervals(li)

    def invert(self, maxn=None):
        """
        >>> Intervals([(1, 3), (4, 6), (8, 10)]).invert(20)
        {[0], [3], [6~7], [10~19]}
        """
        return self.__invert__(maxn)

    def __and__(self, other):
        r"""Intersection with an ``Interval`` or another ``Intervals``.

        >>> Intervals([(2, 4), (9, 11)]) & Interval(0, 10)
        {[2~3], [9]}
        >>> Intervals([(1, 5), (6, 8)]) & Intervals([(2, 7), (7, 9)])
        {[2~4], [6], [7]}
        >>> Intervals([(2, 11)]) & Intervals()
        {}
        >>> Intervals() & Intervals([(2, 11)])
        {}
        """
        # 0 Interval -> Intervals
        if isinstance(other, Interval):
            other = Intervals([other])

        # 1 Both sides are sorted, so the scan over B can be pruned.
        A = self.merge_intersect_interval()
        B = other.merge_intersect_interval()
        li, k = [], 0
        for a in A:
            for j in range(k, len(B)):
                b = B[j]
                if b.end() <= a.start():
                    # b lies left of a, hence left of every later a: resume from here
                    k = j
                elif b.start() >= a.end():  # b is right of a: no later b can intersect
                    break
                else:
                    li.append(a & b)
        return Intervals(li)

    def is_adjacent_and(self, other):
        """Variant of ``&`` that returns a bool and also counts touching intervals as intersecting.

        >>> Intervals([(2, 4), (9, 11)]).is_adjacent_and(Interval(0, 10))
        True
        >>> Intervals([(1, 5), (6, 8)]).is_adjacent_and(Intervals([(2, 7), (7, 9)]))
        True
        >>> Intervals([(2, 11)]).is_adjacent_and(Intervals())
        False
        >>> Intervals().is_adjacent_and(Intervals([(2, 11)]))
        False
        >>> Intervals([(2, 11)]).is_adjacent_and(Interval(11, 13))
        True
        """
        # 0 Interval -> Intervals
        if isinstance(other, Interval):
            other = Intervals([other])

        # 1 Same pruned scan as ``__and__``, with inclusive bounds
        A = self.merge_intersect_interval()
        B = other.merge_intersect_interval()
        k = 0
        for a in A:
            for j in range(k, len(B)):
                b = B[j]
                if b.end() < a.start():
                    k = j
                elif b.start() > a.end():
                    break
                else:
                    return True
        return False

    def __contains__(self, other):
        r"""Whether every interval of ``other`` is covered by a single interval of ``self``.

        >>> Interval(3, 5) in Intervals([(2, 6)])
        True
        >>> Interval(3, 5) in Intervals([(0, 4)])
        False
        >>> Intervals([(1, 2), (3, 4)]) in Intervals([(0, 3), (3, 5)])
        True
        >>> Interval(3, 5) not in Intervals([(2, 6)])
        False

        Both collections are sorted, so two forward-moving cursors are enough.
        """
        # 1 Wrap a single interval so only the collection case remains
        if isinstance(other, (Interval, list, tuple)):
            other = Intervals([other])

        # 2 Merge overlapping intervals
        A = self.merge_intersect_interval()
        B = other.merge_intersect_interval()

        # 3 Every b must be contained in some a
        i = 0
        for b in B:
            for j in range(i, len(A)):
                if b in A[j]:
                    # Earlier a's could not hold this b, so they cannot hold later b's either
                    i = j
                    break
                elif A[j].start() > b.end():
                    # Every remaining a starts beyond b: b is not covered
                    return False
            else:  # ran out of a's without covering b
                return False
        return True

    def __or__(self, other):
        r"""Union with an ``Interval`` or another ``Intervals``.

        Overlapping members are merged into one, so the result never contains
        intersecting intervals (which would break ``sub``).

        >>> Intervals([(2, 4), (5, 7)]) | Interval(1, 3)
        {[1~3: 1~2 2~3], [5~6]}
        >>> Intervals([(2, 4), (5, 7)]) | Intervals([(1, 3), (6, 9)])
        {[1~3: 1~2 2~3], [5~8: 5~6 6~8]}

        >>> Intervals([(1, 3), (6, 9)]) + 3
        {[4~5], [9~11]}
        """
        if isinstance(other, Interval):
            other = Intervals([other])
        else:
            other = Intervals(other)
        return Intervals(self.li + other.li).merge_intersect_interval()

    def __add__(self, other):
        """Shift every interval by an int, or behave like ``|`` for interval operands."""
        if isinstance(other, int):
            return Intervals([x + other for x in self.li])
        return self | other

    def __sub__(self, other):
        """Set difference (unlike ``Interval - Interval``, which treats ``other`` as one span).

        Every a in ``self`` becomes a minus all b in ``other``.

        >>> Intervals([(0, 10)]) - Interval(4, 6)
        {[0~3], [6~9]}
        >>> Intervals([(0, 10)]) - Interval(8, 12)
        {[0~7]}

        >>> Intervals([(0, 10), (20, 30)]) - Intervals([(0, 5), (15, 25)])
        {[5~9], [25~29]}
        >>> Intervals([(0, 10), (20, 30)]) - Intervals([(2, 5), (7, 12), (25, 27)])
        {[0~1], [5~6], [20~24], [27~29]}
        """
        # 1
        if isinstance(other, Interval):
            other = Intervals([other])

        # a - b may split a into a1 and a2. a1 is finished with in this round;
        # a2 is pushed back for the next round, so A is used as a stack
        # (reversed, so that pop() yields the leftmost interval).
        A = list(reversed(self.merge_intersect_interval().li))
        B = other.merge_intersect_interval()

        # 2
        li, k = [], 0
        while A:
            a = A.pop()
            for j in range(k, len(B)):
                b = B[j]
                if b.end() < a.start():
                    # b is left of a, hence left of every later a
                    k = j
                elif a.end() <= b.start():
                    # b (and every later b) is right of a: nothing more to subtract
                    break
                else:
                    c = a - b
                    if not c:  # a has been consumed completely
                        a = Interval()
                        break
                    elif len(c.regs) == 1:
                        a = c
                    else:  # split in two: keep the left part, queue the right part
                        a = Interval(c.regs[1])
                        A.append(Interval(c.regs[2]))
            if a:
                li.append(a)

        return Intervals(li).merge_intersect_interval()
|
763
|
+
|
764
|
+
|
765
|
+
def iter_intervals(arg):
    """Yield interval objects built from one of several interval-like inputs.

    Accepted forms, checked in this order:

    * any object with a ``regs`` attribute (``Interval``, ``ReMatch``, an ``re``
      match object) -> one ``Interval`` built from it
    * a 2-item sequence of ints such as ``(4, 10)`` -> one ``Interval``
    * an ``Intervals`` collection -> each contained interval
    * a non-empty sequence whose first item is a 2-int range, such as
      ``[(2, 4), (6, 8)]`` -> one ``Interval`` per item
    * any other iterable -> its items, yielded unchanged

    Anything else yields nothing.

    :param arg: the interval-like value to expand
    :return: generator of interval objects (possibly several)
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported locally because ``import collections`` alone
    # does not guarantee that the ``abc`` submodule is loaded.
    from collections.abc import Iterable

    def judge_range(t):
        return hasattr(t, '__len__') and len(t) == 2 and isinstance(t[0], int) and isinstance(t[1], int)

    if hasattr(arg, 'regs'):
        yield Interval(arg)
    elif judge_range(arg):
        yield Interval(arg)
    elif isinstance(arg, Interval):
        # Kept for safety; objects exposing ``regs`` are already handled above.
        yield arg
    elif isinstance(arg, Intervals):
        yield from arg
    elif hasattr(arg, '__len__') and len(arg) and judge_range(arg[0]):
        for t in arg:
            yield Interval(t)
    elif isinstance(arg, Iterable):
        # Items are passed through as-is (e.g. a list of match objects).
        yield from arg
|
786
|
+
|
787
|
+
|
788
|
+
def highlight_intervals(content, intervals, colors=None, background=True,
                        use_mathjax=False,
                        only_body=False,
                        title='highlight_intervals',
                        set_pre='<pre class="prettyprint nocode linenums" style="white-space: pre-wrap;">'):
    """Build HTML that shows ``content`` with the given intervals highlighted.

    :param content: the text to display; every piece of it is HTML-escaped
    :param intervals: an iterable whose elements are each a single interval or
        a group of intervals, in any form ``iter_intervals`` accepts:
            Interval, an ``re`` match object, (4, 10)
            Intervals, [(2, 4), (6, 8)]

        Proper nesting of the intervals is the caller's responsibility; this
        function neither checks nor repairs badly nested input.
    :param set_pre: the opening ``<pre>`` tag placed in front of the text.
        Without automatic wrapping: '<pre class="prettyprint nocode linenums">'
        With wrapping of long lines (the default):
            set_pre='<pre class="prettyprint nocode linenums" style="white-space: pre-wrap;">'
    :param colors: a colour name or a sequence of colours; element ``i`` of
        ``intervals`` uses ``colors[i % len(colors)]``.
        ``None`` or an empty sequence means ``('red',)``.
    :param background:
        True, colour the background of the highlighted text
        False, colour the font instead
    :param use_mathjax: forwarded to the page template (only used when
        ``only_body`` is false); per the original notes True renders formulas
        and False shows them as plain text
    :param title: page title forwarded to the page template
    :param only_body: return only the ``<pre>...</pre>`` fragment instead of
        a complete HTML page
    :return: the HTML as a string
    """
    from collections import defaultdict
    import html

    # 1 Markup to insert, keyed by the position in ``content`` where it goes
    marks = defaultdict(str)

    # 2 One colour per element of ``intervals``, cycling through ``colors``
    if colors is None:
        colors = ('red',)
    elif isinstance(colors, str):
        colors = (colors,)
    if len(colors) == 0:  # an empty palette would make ``i % n`` divide by zero
        colors = ('red',)
    n = len(colors)
    for i, arg in enumerate(intervals):
        color = colors[i % n]
        if background:
            opening, closing = f'<span style="background-color: {color}">', '</span>'
        else:
            opening, closing = f'<font color={color}>', '</font>'
        for interval in iter_intervals(arg):
            l, r = interval.start(), interval.end()
            # Opening tags go after, closing tags before, whatever is already
            # stored at that position, so tags meeting at one index close first.
            marks[l] = marks[l] + opening
            marks[r] = closing + marks[r]

    # 3 Interleave the escaped text pieces with the stored markup
    pieces = [set_pre]
    prev = 0
    for pos in sorted(marks):
        # Every piece is escaped, including the text before the first mark;
        # leaving any piece raw would let markup inside ``content`` through.
        pieces.append(html.escape(content[prev:pos]))
        pieces.append(marks[pos])
        prev = pos
    pieces.append(html.escape(content[prev:]))
    pieces.append('</pre>')
    body = ''.join(pieces)

    if only_body:
        return body

    # Imported only here so the fragment-only path does not need the template machinery.
    from pyxllib.text.xmllib import get_jinja_template
    return get_jinja_template('highlight_code.html').render(title=title, body=body, use_mathjax=use_mathjax)
|
861
|
+
|
862
|
+
|
863
|
+
class StrIdxBack:
    r"""Match against a string with noise characters removed, then map hits back to the original.

    ``oristr`` is the untouched input, ``keystr`` is what is left after the
    deletions made so far, and ``idx[i]`` is the position in ``oristr`` of
    the character ``keystr[i]``.

    >>> ob = StrIdxBack('bxx  ax xbxax')
    >>> ob.delchars(r'[ x]+')
    >>> ob  # spaces and the letter x have been removed
    baba
    >>> print(ob.idx)  # position in the original string of every kept character
    (0, 5, 9, 11)
    >>> m = re.match(r'b(ab)', ob.keystr)
    >>> m = ob.matchback(m)
    >>> m.group(1)
    'ax xb'
    >>> ob.search('ab')  # the matching slice of the original string
    'ax xb'
    """

    def __init__(self, s):
        """Start with nothing removed, so ``keystr`` equals ``s``.

        :param s: the original string
        """
        self.oristr = s
        self.idx = tuple(range(len(s)))  # original positions of the characters still kept
        self.keystr = s

    def delchars(self, pattern, flags=0):
        r"""Delete everything in ``keystr`` that matches ``pattern``.

        Works like a regex substitution whose replacement is always the empty
        string; the match objects tell which positions are being dropped, so
        ``idx`` is kept in step with ``keystr``.

        :param pattern: regular expression describing the text to delete
        :param flags: ``re`` flags passed on to ``re.sub``

        >>> ob = StrIdxBack('abc123df4a'); ob.delchars(r'\d+'); str(ob)
        'abcdfa'
        >>> ob.idx
        (0, 1, 2, 6, 7, 9)
        """
        k = 0  # first position of keystr not yet accounted for
        kept = []  # slices of self.idx that survive the deletion

        def repl(m):
            nonlocal k
            kept.append(self.idx[k:m.start(0)])
            k = m.end(0)
            return ''

        self.keystr = re.sub(pattern, repl, self.keystr, flags=flags)
        kept.append(self.idx[k:])
        self.idx = tuple(itertools.chain.from_iterable(kept))

    def compare_newstr(self, limit=300):
        r"""Return the current text spread out to line up with the original string.

        Kept characters are copied. A removed space becomes ``_``, any other
        removed ASCII character becomes one space, and a removed non-ASCII
        character becomes two spaces. Newlines in the result are shown as the
        two characters ``\n``.

        :param limit: only the first ``limit`` characters of the original are considered

        >>> ob = StrIdxBack('abab'); ob.delchars('b'); ob.compare_newstr()
        'a a '
        """
        s1 = self.oristr
        kept = set(self.idx)

        out = []
        for i in range(min(len(s1), limit)):
            ch = s1[i]
            if i in kept:
                out.append(ch)
            elif ord(ch) >= 128:
                # Wide (e.g. Chinese) characters need two spaces to stay aligned.
                out.append('  ')
            elif ch == ' ':
                # A removed space must stay visible, so mark it with an underscore.
                out.append('_')
            else:
                out.append(' ')

        return ''.join(out).replace('\n', r'\n')

    def compare(self, limit=300):
        """Return the original text and the aligned current text, one per line.

        :param limit: length limit applied to both lines
        """
        s1 = self.oristr.replace('\n', r'\n')[:limit]
        s2 = self.compare_newstr(limit)
        return s1 + '\n' + s2 + '\n'

    def _span_back(self, start, end):
        """Map one ``(start, end)`` span of ``keystr`` to the corresponding span of ``oristr``."""
        if start < 0:
            # The group did not take part in the match; keep the (-1, -1) marker
            # instead of indexing ``idx`` from the end.
            return -1, -1
        if start == end:
            # Empty span: there is no last character to anchor the right edge on.
            pos = self.idx[start] if start < len(self.idx) else len(self.oristr)
            return pos, pos
        # The right edge is exclusive, so map the last matched character and
        # step one past it; removed characters inside the span are included.
        return self.idx[start], self.idx[end - 1] + 1

    def matchback(self, m):
        """Turn a match made on ``keystr`` into the equivalent match on ``oristr``.

        :param m: a match object obtained by matching against ``keystr``
        :return: a ``ReMatch`` over ``oristr`` whose spans are the mapped spans of ``m``
        """
        regs = [self._span_back(start, end) for start, end in m.regs]
        return ReMatch(regs, self.oristr, m.pos, len(self.oristr), m.lastindex, m.lastgroup, m.re)

    def search(self, pattern):
        """Search ``keystr`` for ``pattern`` and return the matching part of ``oristr``.

        :param pattern: regular expression to look for in ``keystr``
        :return: the slice of the original string covered by the first match,
            or ``''`` when nothing matches
        """
        m = re.search(pattern, self.keystr)
        if m:
            return self.matchback(m).group()
        else:
            return ''

    def __repr__(self):
        """Return the current text, i.e. ``keystr``."""
        return self.keystr
|