regex 2026.1.14__cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- regex/__init__.py +3 -0
- regex/_main.py +746 -0
- regex/_regex.cpython-311-s390x-linux-gnu.so +0 -0
- regex/_regex_core.py +4675 -0
- regex/tests/test_regex.py +4540 -0
- regex-2026.1.14.dist-info/METADATA +1059 -0
- regex-2026.1.14.dist-info/RECORD +10 -0
- regex-2026.1.14.dist-info/WHEEL +7 -0
- regex-2026.1.14.dist-info/licenses/LICENSE.txt +208 -0
- regex-2026.1.14.dist-info/top_level.txt +1 -0
regex/_regex_core.py
ADDED
|
@@ -0,0 +1,4675 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Secret Labs' Regular Expression Engine core module
|
|
3
|
+
#
|
|
4
|
+
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# This version of the SRE library can be redistributed under CNRI's
|
|
7
|
+
# Python 1.6 license. For any other use, please contact Secret Labs
|
|
8
|
+
# AB (info@pythonware.com).
|
|
9
|
+
#
|
|
10
|
+
# Portions of this engine have been developed in cooperation with
|
|
11
|
+
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
|
|
12
|
+
# other compatibility work.
|
|
13
|
+
#
|
|
14
|
+
# 2010-01-16 mrab Python front-end re-written and extended
|
|
15
|
+
|
|
16
|
+
import enum
|
|
17
|
+
import string
|
|
18
|
+
import unicodedata
|
|
19
|
+
from collections import defaultdict
|
|
20
|
+
|
|
21
|
+
from regex import _regex
|
|
22
|
+
|
|
23
|
+
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
|
|
24
|
+
"F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
|
|
25
|
+
"POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
|
|
26
|
+
"V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
|
|
27
|
+
"Scanner", "RegexFlag"]
|
|
28
|
+
|
|
29
|
+
# The regex exception.
|
|
30
|
+
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # Use a bytes newline when the pattern itself is a bytes pattern.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if not (pattern is None or pos is None):
            # Both 1-based: line from the newline count before 'pos', column
            # from the distance to the previous newline (rfind returns -1 for
            # the first line, which makes colno 1-based there too).
            self.lineno = 1 + pattern.count(newline, 0, pos)
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = f"{message} at position {pos}"

            # Only mention line/column for multi-line patterns.
            if newline in pattern:
                message += f" (line {self.lineno}, column {self.colno})"

        Exception.__init__(self, message)
|
|
58
|
+
|
|
59
|
+
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    """Internal: signals that a global flag was set mid-pattern (old behaviour)."""
    pass
|
|
63
|
+
|
|
64
|
+
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    """Internal: parsing failed at this point; the caller should backtrack."""
    pass
|
|
67
|
+
|
|
68
|
+
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    """Internal: no valid firstset could be built for the pattern."""
    pass
|
|
71
|
+
|
|
72
|
+
# Flags.
class RegexFlag(enum.IntFlag):
    """The regex option flags.

    Each flag has both a single-letter alias (first name) and a long name.
    NOTE(review): the numeric values appear to be shared with the _regex
    extension — confirm before changing any of them.
    """
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named flag: report its canonical (short) name.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        # For a negative value, name the complement and invert at the end.
        if negative:
            value = ~value

        # Peel off each known flag that is present in the value.
        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        # Any residue with no named flag is shown as a hex literal.
        if value:
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # Keep int's plain str() rather than the enum's verbose one.
    __str__ = object.__str__
|
|
123
|
+
|
|
124
|
+
# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# The single-letter aliases.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T
|
|
160
|
+
|
|
161
|
+
# The behaviour version used when the pattern specifies neither V0 nor V1.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that end-of-pattern takes the special-character path.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own (without IGNORECASE) has no effect.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

# The names of the opcodes. The order below is significant: each name is
# numbered by its position (see the enumerate loop that builds OP).
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
|
|
295
|
+
|
|
296
|
+
# Define the opcodes in a namespace.
class Namespace:
    # An empty class used purely as an attribute bag.
    pass

OP = Namespace()
# Number the opcodes by their position in OPCODES, e.g. OP.FAILURE == 0.
# NOTE(review): the numbering presumably must match the _regex C extension —
# confirm before reordering OPCODES.
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
|
|
303
|
+
|
|
304
|
+
def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
    """Make room in the given cache.

    Args:
        cache_dict: The cache dictionary to modify.
        args_dict: The dictionary of named list args used by patterns.
        locale_sensitive: The dictionary of locale-sensitivity entries,
            keyed by (pattern_type, pattern); rebuilt to match cache_dict.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Toss out a fraction of the entries at random to make room for new ones.
    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
    # as popitem could penalize the same regular expression repeatedly based
    # on its internal hash value. Being random should spread the cache miss
    # love around.
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        # Cache is already within limits. Normally this should not happen
        # but it could due to multithreading.
        return

    number_to_toss = max_length // divisor + overage

    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        #  re->random->warnings->tokenize->string->re
        return

    for doomed_key in random.sample(cache_keys, number_to_toss):
        try:
            del cache_dict[doomed_key]
        except KeyError:
            # Ignore problems if the cache changed from another thread.
            pass

    # Rebuild the arguments and locale-sensitivity dictionaries.
    # Each cache key is a 6-tuple; unpack it to re-key the side tables.
    args_dict.clear()
    sensitivity_dict = {}
    for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
        args_dict[pattern, pattern_type, flags, default_version, locale] = args
        try:
            sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
        except KeyError:
            pass

    locale_sensitive.clear()
    locale_sensitive.update(sensitivity_dict)
|
|
353
|
+
|
|
354
|
+
def _fold_case(info, string):
    """Return *string* with its case folded according to the pattern's flags."""
    flags = info.flags
    # With no explicit encoding flag, fall back to the guessed encoding.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding
    return _regex.fold_case(flags, string)
|
|
361
|
+
|
|
362
|
+
def is_cased_i(info, char):
    """Report whether *char* has more than one case form under info's flags."""
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1
|
|
365
|
+
|
|
366
|
+
def is_cased_f(flags, char):
    """Report whether *char* has more than one case form under *flags*."""
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1
|
|
369
|
+
|
|
370
|
+
def _compile_firstset(info, fs):
    """Compile the firstset of the pattern into code, or return [] if none."""
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # No usable firstset, or one that matches anything, gives no code.
    if not checked or isinstance(checked, AnyAll):
        return []
    return checked.compile(reverse)
|
|
379
|
+
|
|
380
|
+
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    # A missing member (None) means some branch has no usable first item.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        # A negated character can't contribute to a firstset.
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        # Collect members case-insensitively; the union below carries the
        # combined case flags instead.
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding makes a simple firstset unsafe; give up.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
|
|
410
|
+
|
|
411
|
+
def _flatten_code(code):
|
|
412
|
+
"Flattens the code from a list of tuples."
|
|
413
|
+
flat_code = []
|
|
414
|
+
for c in code:
|
|
415
|
+
flat_code.extend(c)
|
|
416
|
+
|
|
417
|
+
return flat_code
|
|
418
|
+
|
|
419
|
+
def make_case_flags(info):
    """Derive the effective case flags from the pattern's current flags."""
    case_flags = info.flags & CASE_FLAGS
    if info.flags & ASCII:
        # Unicode full case-folding does not apply in ASCII mode.
        case_flags &= ~FULLCASE
    return case_flags
|
|
428
|
+
|
|
429
|
+
def make_character(info, value, in_set=False):
    """Create a Character literal node for code point *value*."""
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))
    # A character set is built case-sensitively.
    return Character(value)
|
|
436
|
+
|
|
437
|
+
def make_ref_group(info, name, position):
    """Create a back-reference node to the group called *name*."""
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
|
|
440
|
+
|
|
441
|
+
def make_string_set(info, name):
    """Create a named string-set node."""
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
|
|
444
|
+
|
|
445
|
+
def make_property(info, prop, in_set):
    """Create a property node, attaching case flags unless inside a set."""
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))
    return prop
|
|
451
|
+
|
|
452
|
+
def _parse_pattern(source, info):
    """Parse a whole pattern: one or more '|'-separated branches."""
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))
    # A single branch needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
|
|
461
|
+
|
|
462
|
+
def parse_sequence(source, info):
    """Parses a sequence, eg. 'abc'.

    Returns a Sequence node. The list is seeded with None, which acts as a
    sentinel so a leading quantifier is detected as "nothing to repeat";
    None entries are stripped before building the Sequence.
    """
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # An inline flag: the case flags may have changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    # The None sentinel marks the repeat so that another
                    # quantifier directly after it raises "multiple repeat".
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)

                    if constraints:
                        # It _is_ a fuzzy constraint.
                        if is_actually_fuzzy(constraints):
                            apply_constraint(source, info, constraints, case_flags,
                              saved_pos, sequence)
                            sequence.append(None)
                        else:
                            # The element was just a literal.
                            sequence.append(Character(ord(ch),
                              case_flags=case_flags))
                    else:
                        # A literal.
                        sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None sentinels before building the node.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
|
|
547
|
+
|
|
548
|
+
def is_actually_fuzzy(constraints):
    """Report whether a parsed fuzzy constraint permits any errors at all."""
    zero = (0, 0)
    # An explicit limit of zero total errors means it's exact.
    if constraints.get("e") == zero:
        return False
    # Likewise if substitutions, insertions and deletions are all zero.
    per_kind = (constraints.get("s"), constraints.get("i"),
      constraints.get("d"))
    return per_kind != (zero, zero, zero)
|
|
557
|
+
|
|
558
|
+
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Wrap the last element of *sequence* in a repeat node.

    counts is the (min, max) pair; a trailing '?' or '+' selects lazy or
    possessive repetition. Raises error for a quantifier with no preceding
    element or one applied to another repeat.
    """
    element = sequence.pop()
    if element is None:
        # The None sentinel: either a quantifier follows a repeat directly,
        # or the sequence is empty.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat; push the char back.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
|
|
589
|
+
|
|
590
|
+
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    """Wrap the last element of *sequence* in a Fuzzy node with *constraints*."""
    last = sequence.pop()
    if last is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(last, Group):
        # If a group is marked as fuzzy then put all of the fuzzy part in
        # the group.
        last.subpattern = Fuzzy(last.subpattern, constraints)
    else:
        last = Fuzzy(last, constraints)

    sequence.append(last)
|
|
603
|
+
|
|
604
|
+
# The single-character quantifiers and their (min, max) counts; None means
# unbounded.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
|
|
605
|
+
|
|
606
|
+
def parse_quantifier(source, info, ch):
    """Parse a quantifier ('?', '*', '+' or '{m,n}').

    Returns the (min, max) counts, or None if *ch* starts no quantifier.
    """
    counts = _QUANTIFIERS.get(ch)
    if counts:
        return counts

    if ch != "{":
        return None

    # Looks like a limited repeated element, eg. 'a{2,3}'; this returns
    # None itself if it turns out not to be one.
    return parse_limited_quantifier(source)
|
|
620
|
+
|
|
621
|
+
def is_above_limit(count):
    """Report whether a repeat count reaches the engine's maximum."""
    if count is None:
        # None means "unlimited", which is represented, not counted.
        return False
    return count >= UNLIMITED
|
|
624
|
+
|
|
625
|
+
def parse_limited_quantifier(source):
    """Parses a limited quantifier such as '{2,3}' (the '{' is already read).

    Returns (min_count, max_count) with max_count None for unlimited, or
    None (with the position restored) if this isn't a quantifier after all.
    """
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        # A bare '{...}' with no comma and no digits is a literal brace.
        if not min_count:
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    # Without the closing brace it's not a quantifier.
    if not source.match("}"):
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
|
|
654
|
+
|
|
655
|
+
def parse_fuzzy(source, info, ch, case_flags):
    """Parses a fuzzy setting, if present.

    Returns the constraints dict, or None (restoring the position) if the
    '{...}' isn't a fuzzy setting.
    """
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; backtrack.
        source.pos = saved_pos
        return None

    # An optional ':' introduces a test pattern for the fuzzy match.
    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
|
|
678
|
+
|
|
679
|
+
def parse_fuzzy_item(source, constraints):
    """Parses a fuzzy setting item: a cost constraint or a cost equation."""
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple constraint; try it as a cost equation instead.
        source.pos = saved_pos

        parse_cost_equation(source, constraints)
|
|
688
|
+
|
|
689
|
+
def parse_cost_constraint(source, constraints):
    """Parses a cost constraint and stores its (min, max) pair in *constraints*.

    Raises ParseError when the text doesn't form a cost constraint.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
|
|
749
|
+
|
|
750
|
+
def parse_cost_limit(source):
    """Parse a non-negative integer cost limit; raise error if absent."""
    limit_pos = source.pos
    digits = parse_count(source)
    # parse_count yields only digits, so emptiness is the only failure mode.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, limit_pos)
|
|
761
|
+
|
|
762
|
+
def parse_constraint(source, constraints, ch):
    """Validate and return a constraint letter (one of 'd', 'e', 'i', 's').

    Raises ParseError for an unknown or already-seen constraint.
    """
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
|
|
771
|
+
|
|
772
|
+
def parse_fuzzy_compare(source):
    """Parse a cost comparator.

    Returns True for '<=' (inclusive), False for '<' (exclusive), or None
    when neither is present.
    """
    # Try '<=' first so that '<' doesn't swallow its first character.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
|
|
780
|
+
|
|
781
|
+
def parse_cost_equation(source, constraints):
    """Parses a cost equation, eg. '2i+2d+1s<=4', into constraints["cost"]."""
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit ('<') excludes the bound itself.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
|
|
807
|
+
|
|
808
|
+
def parse_cost_term(source, cost):
    """Parse one term of a cost equation (eg. '2d') into *cost*."""
    coeff = parse_count(source)
    kind = source.get()
    if kind not in "dis":
        raise ParseError()

    if kind in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[kind] = int(coeff) if coeff else 1
|
|
819
|
+
|
|
820
|
+
def parse_fuzzy_test(source, info, case_flags):
    """Parses the test pattern of a fuzzy constraint (after the ':').

    Accepts an escape, '.', a character set or a single literal; anything
    else raises error.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)
|
|
845
|
+
|
|
846
|
+
def parse_count(source):
    """Parse a quantifier's count; returns the digit string, possibly empty."""
    digits = source.get_while(DIGITS)
    return digits
|
|
849
|
+
|
|
850
|
+
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.

    Dispatches on the character(s) after "(": "(?" introduces extensions
    (lookarounds, named groups, conditionals, calls, flags, ...), "(*"
    introduces a verb, and anything else is an unnamed capture group.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags end with the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n... or (?-n...: a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags end with the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
|
|
941
|
+
|
|
942
|
+
def parse_extension(source, info):
    """Parses a Python extension: the part after "(?P".

    Handles "(?P<name>...)" (named group), "(?P=name)" (named backreference)
    and "(?P>name)"/"(?P&name)" (named group call).
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags end with the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
|
|
977
|
+
|
|
978
|
+
def parse_comment(source):
    """Parses a comment "(?#...)", discarding its contents.

    Skips characters until ")" or end of input; a backslash escapes the
    following character so it can't terminate the comment.
    """
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            break

        if c == "\\":
            # Skip the escaped character as well.
            c = source.get(True)

    # Back up so that expect() consumes the ")" (and reports an error at the
    # right position if it's missing).
    source.pos = saved_pos
    source.expect(")")

    return None
|
|
994
|
+
|
|
995
|
+
def parse_lookaround(source, info, behind, positive):
    """Parse the body of a lookaround assertion up to its closing ")"."""
    flags_before = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore any flags the subpattern changed via scoped inline flags.
        info.flags = flags_before
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, body)
|
|
1006
|
+
|
|
1007
|
+
def parse_conditional(source, info):
    """Parses a conditional subpattern "(?(test)yes|no)".

    The test is either a lookaround ("(?(?=...)...)" etc.) or a group
    name/number whose having matched decides the branch taken.
    """
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # The test is a group name or number.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Restore flags changed by scoped inline flags in the branches.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    if yes_branch.is_empty() and no_branch.is_empty():
        # Both branches empty: the conditional matches nothing either way.
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
|
|
1049
|
+
|
|
1050
|
+
def parse_lookaround_conditional(source, info, behind, positive):
    """Parse a conditional subpattern whose test is a lookaround."""
    flags_before = info.flags
    try:
        test = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore flags changed by scoped inline flags inside the test.
        info.flags = flags_before
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    no_branch = parse_sequence(source, info) if source.match("|") else Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, test, yes_branch, no_branch)
|
|
1069
|
+
|
|
1070
|
+
def parse_atomic(source, info):
    """Parse an atomic (non-backtracking) subpattern "(?>...)"."""
    flags_before = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore flags changed by scoped inline flags inside the body.
        info.flags = flags_before
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)
|
|
1081
|
+
|
|
1082
|
+
def parse_common(source, info):
    """Parse a "(?|...)" branch, where alternatives share group numbers."""
    # Every alternative restarts capture numbering from the same point, so
    # groups in different branches can reuse the same numbers.
    base_count = info.group_count
    alternatives = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = base_count
        alternatives.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    if len(alternatives) == 1:
        return alternatives[0]

    return Branch(alternatives)
|
|
1099
|
+
|
|
1100
|
+
def parse_call_group(source, info, ch, pos):
    """Parse a call to a numbered group, e.g. "(?1)", or recursion "(?R)"."""
    # "(?R)" recurses into the whole pattern, which is group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
|
|
1110
|
+
|
|
1111
|
+
def parse_rel_call_group(source, info, ch, pos):
    """Parse a relative group call such as "(?+1)" or "(?-2)"."""
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    distance = int(digits)
    if ch == "+":
        # Counts forwards from the groups opened so far.
        group = info.group_count + distance
    else:
        # Counts backwards from the most recently opened group.
        group = info.group_count - distance + 1

    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)
|
|
1125
|
+
|
|
1126
|
+
def parse_call_named_group(source, info, pos):
    """Parse a call to a named group, e.g. "(?&name)" or "(?P>name)"."""
    name = parse_name(source)
    source.expect(")")

    return CallGroup(info, name, pos)
|
|
1132
|
+
|
|
1133
|
+
def parse_flag_set(source):
    """Parses a set of inline flags.

    Consumes flag letters until a character that isn't a flag is found and
    returns the combined flag bits (0 if none were present).
    """
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # Version flags are two characters long: "V0" or "V1".
                ch += source.get()
            # A non-flag character raises KeyError, terminating the loop.
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Back up to just before the character that wasn't a flag.
        source.pos = saved_pos

    return flags
|
|
1148
|
+
|
|
1149
|
+
def parse_flags(source, info):
    """Parse inline flags being turned on/off; returns (flags_on, flags_off)."""
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Record that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
|
|
1165
|
+
|
|
1166
|
+
def parse_subpattern(source, info, flags_on, flags_off):
    """Parse a "(?flags:...)" subpattern whose flags apply only inside it."""
    flags_before = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # The scoped flags end here; restore the caller's flags.
        info.flags = flags_before
        source.ignore_space = bool(info.flags & VERBOSE)

    return body
|
|
1184
|
+
|
|
1185
|
+
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.

    Raises _UnscopedFlagSet when a new global flag is turned on, which
    forces the whole pattern to be reparsed with that flag in effect.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # "(?flags:...)": flags scoped to a subpattern.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # "(?flags)": flags applying from this point on.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
|
|
1219
|
+
|
|
1220
|
+
def parse_positional_flags(source, info, flags_on, flags_off):
    """Apply inline "(?flags)" from this point in the pattern onwards."""
    updated = (info.flags | flags_on) & ~flags_off
    info.flags = updated
    source.ignore_space = bool(updated & VERBOSE)
|
|
1224
|
+
|
|
1225
|
+
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parse a group name, stopping before ")" or ">".

    Numeric names are accepted only when allow_numeric is true, and "0"
    only when allow_group_0 is also true; otherwise the name must be a
    valid Python identifier.
    """
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        lowest = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < lowest:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string,
          source.pos)

    return name
|
|
1243
|
+
|
|
1244
|
+
def is_octal(string):
    """Return True if every character of *string* is an octal digit."""
    # NOTE: the parameter shadows the stdlib `string` module (file-wide style).
    return not any(c not in OCT_DIGITS for c in string)
|
|
1247
|
+
|
|
1248
|
+
def is_decimal(string):
    """Return True if every character of *string* is a decimal digit."""
    # NOTE: the parameter shadows the stdlib `string` module (file-wide style).
    return not any(c not in DIGITS for c in string)
|
|
1251
|
+
|
|
1252
|
+
def is_hexadecimal(string):
    """Return True if every character of *string* is a hexadecimal digit."""
    # NOTE: the parameter shadows the stdlib `string` module (file-wide style).
    return not any(c not in HEX_DIGITS for c in string)
|
|
1255
|
+
|
|
1256
|
+
def parse_escape(source, info, in_set):
    """Parses an escape sequence, either inside or outside a character set.

    Returns a parse-tree node; several escapes ("\\g", "\\G", "\\L", "\\R",
    "\\X" and positional escapes) are only recognised outside a set.
    """
    # The character after the backslash is always significant, even in
    # verbose mode, so read it with whitespace-skipping disabled.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: CRLF as a unit, or any single line-ending character.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            # Unicode adds NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes (e.g. \d, \w) depend on the encoding flags.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Simple character escapes (e.g. \n, \t).
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
|
|
1338
|
+
|
|
1339
|
+
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence.

    Inside a set, or when starting with "0", it is an octal escape;
    otherwise it is an octal escape only when it has exactly 3 octal
    digits, and a group reference otherwise.
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # 8-bit encodings: keep only the low byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference: push back the non-digit just read.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
|
|
1371
|
+
|
|
1372
|
+
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence.

    *digits* already holds the first character; up to 3 octal digits in
    total are consumed.
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() failed: the initial character wasn't a valid octal digit.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
|
|
1392
|
+
|
|
1393
|
+
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parse a fixed-length hexadecimal escape of *expected_len* digits."""
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    text = "".join(digits)
    try:
        value = int(text, 16)
    except ValueError:
        pass
    else:
        if value < 0x110000:
            # Within the Unicode codepoint range.
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, start_pos)
|
|
1415
|
+
|
|
1416
|
+
def parse_group_ref(source, info):
    """Parse a "\\g<...>" group reference (name or number)."""
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
|
|
1426
|
+
|
|
1427
|
+
def parse_string_set(source, info):
    """Parse a "\\L<name>" reference to a named list passed via kwargs."""
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
|
|
1436
|
+
|
|
1437
|
+
def parse_named_char(source, info, in_set):
    """Parse a "\\N{...}" named Unicode character.

    If the syntax isn't a well-formed "\\N{...}" the "N" is treated as a
    literal character.
    """
    start = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not "\N{...}", so it's just a literal "N".
    source.pos = start
    return make_character(info, ord("N"), in_set)
|
|
1452
|
+
|
|
1453
|
+
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, written "\\p{...}", "\\P{...}" or
    abbreviated like "\\pL"; falls back to a literal "p"/"P".
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # "^" inside the braces negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            # positive != negate: "\P{^...}" is double negation.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
|
|
1488
|
+
|
|
1489
|
+
def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns (prop_name, name): prop_name is None for an unqualified name,
    otherwise the part before ":" or "=" with *name* being the value.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Back up to just after the last character that belongs to the name.
    source.pos = saved_pos
    return prop_name, name
|
|
1510
|
+
|
|
1511
|
+
def parse_set(source, info):
    """Parse a character set "[...]"; the leading "[" is already consumed."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    ignore_before = source.ignore_space
    source.ignore_space = False
    # A leading "^" negates the whole set.
    negated = source.match("^")
    try:
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = ignore_before

    if negated:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))
|
|
1536
|
+
|
|
1537
|
+
def parse_set_union(source, info):
    """Parse a set union ("x||y")."""
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    if len(operands) > 1:
        return SetUnion(info, operands)

    return operands[0]
|
|
1546
|
+
|
|
1547
|
+
def parse_set_symm_diff(source, info):
    """Parse a set symmetric difference ("x~~y")."""
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    if len(operands) > 1:
        return SetSymDiff(info, operands)

    return operands[0]
|
|
1556
|
+
|
|
1557
|
+
def parse_set_inter(source, info):
    """Parse a set intersection ("x&&y")."""
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    if len(operands) > 1:
        return SetInter(info, operands)

    return operands[0]
|
|
1566
|
+
|
|
1567
|
+
def parse_set_diff(source, info):
    """Parse a set difference ("x--y")."""
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    if len(operands) > 1:
        return SetDiff(info, operands)

    return operands[0]
|
|
1576
|
+
|
|
1577
|
+
def parse_set_imp_union(source, info):
    """Parses a set implicit union ([xy]): members simply juxtaposed."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set; leave the "]" for the caller to consume.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; stop before the operator.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
|
|
1599
|
+
|
|
1600
|
+
def parse_set_member(source, info):
    """Parses a member in a character set: a single item or a range "a-b"."""
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A single-character "range".
        return start

    return Range(start.value, end.value)
|
|
1640
|
+
|
|
1641
|
+
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a
    nested set (version 1 behaviour only), or a single character.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class; backtrack.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
|
|
1678
|
+
|
|
1679
|
+
def parse_posix_class(source, info):
    """Parse a POSIX character class "[:name:]" (leading "[:" consumed)."""
    negated = source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        raise ParseError()

    return lookup_property(prop_name, name, not negated, source, posix=True)
|
|
1687
|
+
|
|
1688
|
+
def float_to_rational(flt):
    "Converts a float to a rational pair."
    # Peel off the integer portion; a negligible remainder means the value
    # is (close enough to) a whole number.
    whole = int(flt)
    remainder = flt - whole
    if abs(remainder) < 0.0001:
        return whole, 1

    # Express the reciprocal of the remainder as a fraction (continued-
    # fraction style), then fold the whole part back in:
    # whole + num/den == (whole*den + num) / den.
    den, num = float_to_rational(1.0 / remainder)

    return whole * den + num, den
|
|
1698
|
+
|
|
1699
|
+
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    # Strip a leading minus sign, remembering it for the result.
    sign = ""
    if numeric[ : 1] == "-":
        sign = "-"
        numeric = numeric[1 : ]

    pieces = numeric.split("/")
    if len(pieces) == 1:
        num, den = float_to_rational(float(pieces[0]))
    elif len(pieces) == 2:
        num, den = float_to_rational(float(pieces[0]) / float(pieces[1]))
    else:
        # More than one "/" can't be a rational.
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # A denominator of 1 is dropped.
    return result[ : -2] if result.endswith("/1") else result
|
|
1719
|
+
|
|
1720
|
+
def standardise_name(name):
    "Standardises a property or value name."
    # Numeric names are canonicalised to a rational string; anything else
    # has separators stripped and is uppercased.
    try:
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        pass

    return "".join(ch for ch in name if ch not in "_- ").upper()
|
|
1726
|
+
|
|
1727
|
+
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())
|
|
1728
|
+
|
|
1729
|
+
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
|
|
1730
|
+
|
|
1731
|
+
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    "Looks up a property."
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the complement of "Unassigned", so flip the polarity.
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    # Inside [[:...:]] some bare names refer to POSIX classes rather than
    # the Unicode properties of the same name.
    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Property and value ids are packed into a single integer.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            # Binary property: val_id 1 means "yes".
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        # Non-binary property named without a value: val_id 0 with the
        # polarity inverted.
        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    # (Both prefixes are 2 characters, hence the fixed [2 : ] slice.)
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
|
|
1800
|
+
|
|
1801
|
+
def _compile_replacement(source, pattern, is_unicode):
    "Compiles a replacement template escape sequence."
    # Returns (is_group, [values]): is_group is True when the values are
    # group numbers rather than literal codepoints.
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group preference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Octal values are masked to a byte for bytes templates, to 9 bits for
    # str templates.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence: \0 followed by up to 2 more octal digits.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                # Not an octal digit; push it back.
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference; the last character read is pushed back.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
|
|
1872
|
+
|
|
1873
|
+
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    # Exactly expected_len hex digits must follow; anything else is an
    # incomplete escape.
    collected = ""
    while len(collected) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, collected),
              source.string, source.pos)
        collected += ch

    return int(collected, 16)
|
|
1884
|
+
|
|
1885
|
+
def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    start = source.pos
    if source.match("{"):
        # Unicode character names consist of letters and spaces.
        name = source.get_while(ALPHA | set(" "))
        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; rewind so the caller can handle it.
    source.pos = start
    return None
|
|
1901
|
+
|
|
1902
|
+
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference."
    # Expects \g<name> or \g<number>; returns the group's index.
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numbered group; 0 refers to the whole match.
        index = int(name)
        if not 0 <= index <= pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        # IndexError (not error) to match the stdlib re module's behaviour
        # for unknown group names in templates.
        raise IndexError("unknown group")
|
|
1919
|
+
|
|
1920
|
+
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the dump() methods.
INDENT = "  "
# Bit flags packed into compiled opcode arguments.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is shifted above the flag bits.
ENCODING_OP_SHIFT = 5

# Text fragments used by dump() output.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    "Wraps a list of nodes, avoiding a Sequence around a single item."
    if len(items) != 1:
        return Sequence(items)
    return items[0]
|
|
1939
|
+
|
|
1940
|
+
# Common base class for all nodes.
|
|
1941
|
+
class RegexBase:
    """Common base class for all parse-tree nodes.

    Subclasses override the hooks below; the defaults here are the
    "do nothing / simplest answer" behaviours.
    """

    def __init__(self):
        # _key identifies the node for __hash__/__eq__; subclasses extend it
        # with their distinguishing attributes.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns a copy with the given flags changed, reusing self when
        # nothing actually changes. Relies on the subclass providing
        # .positive, .case_flags, .zerowidth and .rebuild().
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise to a canonical case-flag combination.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve group references; nothing to do for most nodes.
        pass

    def optimise(self, info, reverse):
        # Return an optimised equivalent of this node.
        return self

    def pack_characters(self, info):
        # Combine adjacent characters into literals where possible.
        return self

    def remove_captures(self):
        # Strip capture groups (used when compiling lookarounds, etc.).
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        # Whether the node may be hoisted out as a common prefix/suffix.
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # Default: no usable firstset for this node type.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Returns (offset, required-literal-or-None); by default no literal
        # is required.
        return self.max_width(), None
|
|
2009
|
+
|
|
2010
|
+
# Base class for zero-width nodes.
|
|
2011
|
+
class ZeroWidthBase(RegexBase):
    """Base class for zero-width assertion nodes (boundaries, anchors).

    Subclasses supply _opcode and _op_name.
    """

    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Zero-width: matches no character, so the firstset is {None}.
        return set([None])

    def _compile(self, reverse, fuzzy):
        # Pack the assertion's flags and encoding into the opcode argument.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        # NOTE(review): the trailing selector assumes encoding is 0 or 1
        # (non-ASCII vs ASCII) - confirm against the encoding values used
        # elsewhere in the module.
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0
|
|
2039
|
+
|
|
2040
|
+
class Any(RegexBase):
    """Matches any single character except a newline ("." without DOTALL)."""

    # Opcode keyed by whether we're matching in reverse.
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1
|
|
2058
|
+
|
|
2059
|
+
class AnyAll(Any):
    """Matches any single character including newlines ("." with DOTALL)."""

    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Sets the flag attributes directly instead of calling
        # RegexBase.__init__; ANY_ALL has no variants.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
|
|
2069
|
+
|
|
2070
|
+
class AnyU(Any):
    # "Any" under Unicode line-separator semantics; only the opcodes differ.
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
|
|
2073
|
+
|
|
2074
|
+
class Atomic(RegexBase):
    """An atomic (non-backtracking) group: (?>...).

    Most hooks simply delegate to the wrapped subpattern.
    """

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic group around nothing is just nothing.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # The subpattern's code is bracketed by ATOMIC ... END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
|
|
2129
|
+
|
|
2130
|
+
class Boundary(ZeroWidthBase):
    # Word-boundary assertion (\b / \B); behaviour comes from ZeroWidthBase.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
|
|
2133
|
+
|
|
2134
|
+
class Branch(RegexBase):
    """A set of alternatives ("|" in a pattern)."""

    def __init__(self, branches):
        RegexBase.__init__(self)
        self.branches = branches

    def fix_groups(self, pattern, reverse, fuzzy):
        for b in self.branches:
            b.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the
                # branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)

    def _add_precheck(self, info, reverse, branches):
        # Builds a firstset precheck when every branch is a case-sensitive
        # Literal; returns None (or falls through with a bare return)
        # otherwise.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])

    def pack_characters(self, info):
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        return any(b.contains_group() for b in self.branches)

    def get_firstset(self, reverse):
        # The firstset of a branch is the union of its alternatives'.
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])

    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The last NEXT is replaced by the branch terminator.
        code[-1] = (OP.END, )

        return code

    def dump(self, indent, reverse):
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)

    @staticmethod
    def _flatten_branches(info, reverse, branches):
        # Flatten the branches so that there aren't branches of branches.
        new_branches = []
        for b in branches:
            b = b.optimise(info, reverse)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches

    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches

    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches

    @staticmethod
    def _can_split(items, count):
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True

    @staticmethod
    def _can_split_rev(items, count):
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True

    @staticmethod
    def _merge_common_prefixes(info, reverse, branches):
        # Branches with the same case-sensitive character prefix can be
        # grouped together if they are separated only by other branches with
        # a character prefix.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                Branch._flush_char_prefix(info, reverse, prefixed, order,
                  new_branches)

                new_branches.append(b)

        # Fixed: this final flush previously omitted the 'reverse' argument,
        # which would raise TypeError whenever any prefixed branches were
        # still pending at the end of the loop.
        Branch._flush_char_prefix(info, reverse, prefixed, order,
          new_branches)

        return new_branches

    @staticmethod
    def _is_simple_character(c):
        return isinstance(c, Character) and c.positive and not c.case_flags

    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items,
                      case_flags, new_branches)

                    case_flags = b.case_flags

                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches

    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()

    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()

    @staticmethod
    def _is_full_case(items, i):
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

    @staticmethod
    def _is_folded(items):
        # Do these characters, taken together, full-case-fold to the same
        # string as some single character?
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False

    def is_empty(self):
        return all(b.is_empty() for b in self.branches)

    def __eq__(self, other):
        return type(self) is type(other) and self.branches == other.branches

    def max_width(self):
        return max(b.max_width() for b in self.branches)
|
|
2525
|
+
|
|
2526
|
+
class CallGroup(RegexBase):
    """A subpattern call, (?n) / (?&name): re-runs a group's subpattern."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # group may still be a name here; fix_groups resolves it to an int.
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        # Rebuild the key now that the group has been resolved to an int.
        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): this class never sets a 'pattern' attribute (only
        # info/group/position), so reaching this line would raise
        # AttributeError rather than the intended error - confirm whether
        # this should read self.info.pattern or similar.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # call_ref is assigned externally (not in this class) before
        # compilation.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # The called subpattern's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference to info to avoid a reference cycle.
        self.info = None
|
|
2571
|
+
|
|
2572
|
+
class CallRef(RegexBase):
    """Brackets a parsed subpattern with a call-ref marker so that group
    calls can target it.
    """

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF <ref> ... END around the compiled subpattern.
        head = [(OP.CALL_REF, self.ref)]
        body = self.parsed._compile(reverse, fuzzy)
        return head + body + [(OP.END, )]
class Character(RegexBase):
    """A single codepoint, possibly negated and/or case-insensitive."""

    # Opcode selected by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Pre-compute the full case-folded form when full case-folding is in
        # effect for a positive match.
        full_folding = (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE
        if self.positive and full_folding:
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try the
            # expanded string.
            expanded = String([ord(c) for c in self.folded],
              case_flags=self.case_flags)
            code = Branch([code, expanded])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (ch == self.value) == self.positive

    def max_width(self):
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """A conditional subpattern (?(group)yes|no): matches yes_item if the
    referenced group has matched, otherwise no_item.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        # The reference may be numeric or a group name; resolve it to a
        # group number.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # Bug fix: previously this returned None, so callers that rebind via
        # `item = item.remove_captures()` (the protocol followed by the other
        # node classes) would replace this node with None.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either alternative could start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS <group> <yes> [NEXT <no>] END
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width node compiled to OP.DEFAULT_BOUNDARY."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width node compiled to OP.DEFAULT_END_OF_WORD."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width node compiled to OP.DEFAULT_START_OF_WORD."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
class EndOfLine(ZeroWidthBase):
    """Zero-width node compiled to OP.END_OF_LINE."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to OP.END_OF_LINE_U."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
class EndOfString(ZeroWidthBase):
    """Zero-width node compiled to OP.END_OF_STRING."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
class EndOfStringLine(ZeroWidthBase):
    """Zero-width node compiled to OP.END_OF_STRING_LINE."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to OP.END_OF_STRING_LINE_U."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
class EndOfWord(ZeroWidthBase):
    """Zero-width node compiled to OP.END_OF_WORD."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
class Failure(ZeroWidthBase):
    """Zero-width node compiled to OP.FAILURE."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """A subpattern with fuzzy (approximate) matching constraints.

    The constraints dict maps error types ("d"eletion, "i"nsertion,
    "s"ubstitution, generic "e") to (min, max) pairs, plus a "cost" equation.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for err in "dis":
                if err in constraints["cost"]:
                    constraints.setdefault(err, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        mentioned = set(constraints) & set("dis")
        default_bounds = (0, 0) if mentioned else (0, None)
        for err in "dis":
            constraints.setdefault(err, default_bounds)

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for err in "dis":
                constraints["cost"].setdefault(err, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for err in "dise":
            lo, hi = self.constraints[err]
            arguments.append(lo)
            arguments.append(UNLIMITED if hi is None else hi)

        # The coeffs of the cost equation.
        for err in "dis":
            arguments.append(self.constraints["cost"][err])

        # The maximum of the cost equation.
        cost_max = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if cost_max is None else cost_max)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in the pattern's textual form."""
        constraints = []

        for name in "ids":
            lo, hi = self.constraints[name]
            if hi == 0:
                continue

            con = "{}<=".format(lo) if lo > 0 else ""
            con += name
            if hi is not None:
                con += "<={}".format(hi)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            constraints.append("{}<={}".format("+".join(cost), limit))

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches one extended grapheme cluster."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached.
        # Note that this is the same whether matching forwards or backwards.
        characters = LazyRepeat(AnyAll(), 1, None)
        matcher = Atomic(Sequence([characters, GraphemeBoundary()]))

        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
class GraphemeBoundary:
    """Zero-width check compiled to OP.GRAPHEME_BOUNDARY."""

    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern: min_count..max_count times
    (max_count of None means unbounded).
    """
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) so that subclasses rebuild as themselves.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        upper = UNLIMITED if self.max_count is None else self.max_count
        repeat = (self._opcode, self.min_count, upper)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return [repeat] + subpattern + [(OP.END, )]

    def dump(self, indent, reverse):
        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The whole repeat is optional, so nothing is required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """A possessive repeat: a greedy repeat wrapped in an atomic group."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        upper = UNLIMITED if self.max_count is None else self.max_count
        repeat = (self._opcode, self.min_count, upper)

        # ATOMIC around the repeat, each closed with its own END.
        return ([(OP.ATOMIC, ), repeat] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group wrapping a subpattern."""

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Negative group numbers are private; map them to their public number
        # and a slot past the public groups.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            # Close the CALL_REF wrapper too.
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width node compiled to OP.KEEP."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
class LazyRepeat(GreedyRepeat):
    """A lazy repeat: same structure as GreedyRepeat, different opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
class LookAround(RegexBase):
    """A lookahead/lookbehind assertion, positive or negative."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern matches in the assertion's own direction.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive assertion on nothing is a no-op.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # An assertion consumes no characters.
        return 0
class LookAroundConditional(RegexBase):
|
|
3219
|
+
_dir_text = {False: "AHEAD", True: "BEHIND"}
|
|
3220
|
+
|
|
3221
|
+
def __init__(self, behind, positive, subpattern, yes_item, no_item):
|
|
3222
|
+
RegexBase.__init__(self)
|
|
3223
|
+
self.behind = bool(behind)
|
|
3224
|
+
self.positive = bool(positive)
|
|
3225
|
+
self.subpattern = subpattern
|
|
3226
|
+
self.yes_item = yes_item
|
|
3227
|
+
self.no_item = no_item
|
|
3228
|
+
|
|
3229
|
+
def fix_groups(self, pattern, reverse, fuzzy):
|
|
3230
|
+
self.subpattern.fix_groups(pattern, reverse, fuzzy)
|
|
3231
|
+
self.yes_item.fix_groups(pattern, reverse, fuzzy)
|
|
3232
|
+
self.no_item.fix_groups(pattern, reverse, fuzzy)
|
|
3233
|
+
|
|
3234
|
+
def optimise(self, info, reverse):
|
|
3235
|
+
subpattern = self.subpattern.optimise(info, self.behind)
|
|
3236
|
+
yes_item = self.yes_item.optimise(info, self.behind)
|
|
3237
|
+
no_item = self.no_item.optimise(info, self.behind)
|
|
3238
|
+
|
|
3239
|
+
return LookAroundConditional(self.behind, self.positive, subpattern,
|
|
3240
|
+
yes_item, no_item)
|
|
3241
|
+
|
|
3242
|
+
def pack_characters(self, info):
|
|
3243
|
+
self.subpattern = self.subpattern.pack_characters(info)
|
|
3244
|
+
self.yes_item = self.yes_item.pack_characters(info)
|
|
3245
|
+
self.no_item = self.no_item.pack_characters(info)
|
|
3246
|
+
return self
|
|
3247
|
+
|
|
3248
|
+
def remove_captures(self):
|
|
3249
|
+
self.subpattern = self.subpattern.remove_captures()
|
|
3250
|
+
self.yes_item = self.yes_item.remove_captures()
|
|
3251
|
+
self.no_item = self.no_item.remove_captures()
|
|
3252
|
+
|
|
3253
|
+
def is_atomic(self):
|
|
3254
|
+
return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
|
|
3255
|
+
self.no_item.is_atomic())
|
|
3256
|
+
|
|
3257
|
+
def can_be_affix(self):
|
|
3258
|
+
return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
|
|
3259
|
+
and self.no_item.can_be_affix())
|
|
3260
|
+
|
|
3261
|
+
def contains_group(self):
|
|
3262
|
+
return (self.subpattern.contains_group() or
|
|
3263
|
+
self.yes_item.contains_group() or self.no_item.contains_group())
|
|
3264
|
+
|
|
3265
|
+
def _compile(self, reverse, fuzzy):
|
|
3266
|
+
code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
|
|
3267
|
+
code.extend(self.subpattern.compile(self.behind, fuzzy))
|
|
3268
|
+
code.append((OP.NEXT, ))
|
|
3269
|
+
code.extend(self.yes_item.compile(reverse, fuzzy))
|
|
3270
|
+
add_code = self.no_item.compile(reverse, fuzzy)
|
|
3271
|
+
if add_code:
|
|
3272
|
+
code.append((OP.NEXT, ))
|
|
3273
|
+
code.extend(add_code)
|
|
3274
|
+
|
|
3275
|
+
code.append((OP.END, ))
|
|
3276
|
+
|
|
3277
|
+
return code
|
|
3278
|
+
|
|
3279
|
+
def dump(self, indent, reverse):
|
|
3280
|
+
print("{}CONDITIONAL {} {}".format(INDENT * indent,
|
|
3281
|
+
self._dir_text[self.behind], POS_TEXT[self.positive]))
|
|
3282
|
+
self.subpattern.dump(indent + 1, self.behind)
|
|
3283
|
+
print("{}EITHER".format(INDENT * indent))
|
|
3284
|
+
self.yes_item.dump(indent + 1, reverse)
|
|
3285
|
+
if not self.no_item.is_empty():
|
|
3286
|
+
print("{}OR".format(INDENT * indent))
|
|
3287
|
+
self.no_item.dump(indent + 1, reverse)
|
|
3288
|
+
|
|
3289
|
+
def is_empty(self):
|
|
3290
|
+
return (self.subpattern.is_empty() and self.yes_item.is_empty() or
|
|
3291
|
+
self.no_item.is_empty())
|
|
3292
|
+
|
|
3293
|
+
def __eq__(self, other):
|
|
3294
|
+
return type(self) is type(other) and (self.subpattern, self.yes_item,
|
|
3295
|
+
self.no_item) == (other.subpattern, other.yes_item, other.no_item)
|
|
3296
|
+
|
|
3297
|
+
def max_width(self):
|
|
3298
|
+
return max(self.yes_item.max_width(), self.no_item.max_width())
|
|
3299
|
+
|
|
3300
|
+
def get_required_string(self, reverse):
|
|
3301
|
+
return self.max_width(), None
|
|
3302
|
+
|
|
3303
|
+
class PrecompiledCode(RegexBase):
    """Wraps an already-built code sequence so it can sit in the node tree."""

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
class Property(RegexBase):
    """A Unicode property test, e.g. \p{Lu}, possibly negated and/or
    case-insensitive.
    """

    # Opcode selected by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # The value packs the property id in the high 16 bits and the
        # property value in the low 16 bits.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    """Zero-width node compiled to OP.PRUNE."""
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]
class Range(RegexBase):
|
|
3373
|
+
_opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
|
|
3374
|
+
(FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
|
|
3375
|
+
(NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
|
|
3376
|
+
(FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
|
|
3377
|
+
_op_name = "RANGE"
|
|
3378
|
+
|
|
3379
|
+
def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
|
|
3380
|
+
zerowidth=False):
|
|
3381
|
+
RegexBase.__init__(self)
|
|
3382
|
+
self.lower = lower
|
|
3383
|
+
self.upper = upper
|
|
3384
|
+
self.positive = bool(positive)
|
|
3385
|
+
self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
|
|
3386
|
+
self.zerowidth = bool(zerowidth)
|
|
3387
|
+
|
|
3388
|
+
self._key = (self.__class__, self.lower, self.upper, self.positive,
|
|
3389
|
+
self.case_flags, self.zerowidth)
|
|
3390
|
+
|
|
3391
|
+
def rebuild(self, positive, case_flags, zerowidth):
|
|
3392
|
+
return Range(self.lower, self.upper, positive, case_flags, zerowidth)
|
|
3393
|
+
|
|
3394
|
+
def optimise(self, info, reverse, in_set=False):
|
|
3395
|
+
# Is the range case-sensitive?
|
|
3396
|
+
if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
|
|
3397
|
+
return self
|
|
3398
|
+
|
|
3399
|
+
# Is full case-folding possible?
|
|
3400
|
+
if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
|
|
3401
|
+
FULLIGNORECASE):
|
|
3402
|
+
return self
|
|
3403
|
+
|
|
3404
|
+
# Get the characters which expand to multiple codepoints on folding.
|
|
3405
|
+
expanding_chars = _regex.get_expand_on_folding()
|
|
3406
|
+
|
|
3407
|
+
# Get the folded characters in the range.
|
|
3408
|
+
items = []
|
|
3409
|
+
for ch in expanding_chars:
|
|
3410
|
+
if self.lower <= ord(ch) <= self.upper:
|
|
3411
|
+
folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
|
|
3412
|
+
items.append(String([ord(c) for c in folded],
|
|
3413
|
+
case_flags=self.case_flags))
|
|
3414
|
+
|
|
3415
|
+
if not items:
|
|
3416
|
+
# We can fall back to simple case-folding.
|
|
3417
|
+
return self
|
|
3418
|
+
|
|
3419
|
+
if len(items) < self.upper - self.lower + 1:
|
|
3420
|
+
# Not all the characters are covered by the full case-folding.
|
|
3421
|
+
items.insert(0, self)
|
|
3422
|
+
|
|
3423
|
+
return Branch(items)
|
|
3424
|
+
|
|
3425
|
+
def _compile(self, reverse, fuzzy):
|
|
3426
|
+
flags = 0
|
|
3427
|
+
if self.positive:
|
|
3428
|
+
flags |= POSITIVE_OP
|
|
3429
|
+
if self.zerowidth:
|
|
3430
|
+
flags |= ZEROWIDTH_OP
|
|
3431
|
+
if fuzzy:
|
|
3432
|
+
flags |= FUZZY_OP
|
|
3433
|
+
return [(self._opcode[self.case_flags, reverse], flags, self.lower,
|
|
3434
|
+
self.upper)]
|
|
3435
|
+
|
|
3436
|
+
def dump(self, indent, reverse):
|
|
3437
|
+
display_lower = ascii(chr(self.lower)).lstrip("bu")
|
|
3438
|
+
display_upper = ascii(chr(self.upper)).lstrip("bu")
|
|
3439
|
+
print("{}RANGE {} {} {}{}".format(INDENT * indent,
|
|
3440
|
+
POS_TEXT[self.positive], display_lower, display_upper,
|
|
3441
|
+
CASE_TEXT[self.case_flags]))
|
|
3442
|
+
|
|
3443
|
+
def matches(self, ch):
|
|
3444
|
+
return (self.lower <= ch <= self.upper) == self.positive
|
|
3445
|
+
|
|
3446
|
+
def max_width(self):
|
|
3447
|
+
return 1
|
|
3448
|
+
|
|
3449
|
+
class RefGroup(RegexBase):
|
|
3450
|
+
_opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
|
|
3451
|
+
OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
|
|
3452
|
+
False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
|
|
3453
|
+
True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
|
|
3454
|
+
(FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}
|
|
3455
|
+
|
|
3456
|
+
def __init__(self, info, group, position, case_flags=NOCASE):
|
|
3457
|
+
RegexBase.__init__(self)
|
|
3458
|
+
self.info = info
|
|
3459
|
+
self.group = group
|
|
3460
|
+
self.position = position
|
|
3461
|
+
self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
|
|
3462
|
+
|
|
3463
|
+
self._key = self.__class__, self.group, self.case_flags
|
|
3464
|
+
|
|
3465
|
+
def fix_groups(self, pattern, reverse, fuzzy):
|
|
3466
|
+
try:
|
|
3467
|
+
self.group = int(self.group)
|
|
3468
|
+
except ValueError:
|
|
3469
|
+
try:
|
|
3470
|
+
self.group = self.info.group_index[self.group]
|
|
3471
|
+
except KeyError:
|
|
3472
|
+
raise error("unknown group", pattern, self.position)
|
|
3473
|
+
|
|
3474
|
+
if not 1 <= self.group <= self.info.group_count:
|
|
3475
|
+
raise error("invalid group reference", pattern, self.position)
|
|
3476
|
+
|
|
3477
|
+
self._key = self.__class__, self.group, self.case_flags
|
|
3478
|
+
|
|
3479
|
+
def remove_captures(self):
|
|
3480
|
+
raise error("group reference not allowed", self.pattern, self.position)
|
|
3481
|
+
|
|
3482
|
+
def _compile(self, reverse, fuzzy):
|
|
3483
|
+
flags = 0
|
|
3484
|
+
if fuzzy:
|
|
3485
|
+
flags |= FUZZY_OP
|
|
3486
|
+
return [(self._opcode[self.case_flags, reverse], flags, self.group)]
|
|
3487
|
+
|
|
3488
|
+
def dump(self, indent, reverse):
|
|
3489
|
+
print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
|
|
3490
|
+
CASE_TEXT[self.case_flags]))
|
|
3491
|
+
|
|
3492
|
+
def max_width(self):
|
|
3493
|
+
return UNLIMITED
|
|
3494
|
+
|
|
3495
|
+
def __del__(self):
|
|
3496
|
+
self.info = None
|
|
3497
|
+
|
|
3498
|
+
class SearchAnchor(ZeroWidthBase):
|
|
3499
|
+
_opcode = OP.SEARCH_ANCHOR
|
|
3500
|
+
_op_name = "SEARCH_ANCHOR"
|
|
3501
|
+
|
|
3502
|
+
class Sequence(RegexBase):
|
|
3503
|
+
def __init__(self, items=None):
|
|
3504
|
+
RegexBase.__init__(self)
|
|
3505
|
+
if items is None:
|
|
3506
|
+
items = []
|
|
3507
|
+
|
|
3508
|
+
self.items = items
|
|
3509
|
+
|
|
3510
|
+
def fix_groups(self, pattern, reverse, fuzzy):
|
|
3511
|
+
for s in self.items:
|
|
3512
|
+
s.fix_groups(pattern, reverse, fuzzy)
|
|
3513
|
+
|
|
3514
|
+
def optimise(self, info, reverse):
|
|
3515
|
+
# Flatten the sequences.
|
|
3516
|
+
items = []
|
|
3517
|
+
for s in self.items:
|
|
3518
|
+
s = s.optimise(info, reverse)
|
|
3519
|
+
if isinstance(s, Sequence):
|
|
3520
|
+
items.extend(s.items)
|
|
3521
|
+
else:
|
|
3522
|
+
items.append(s)
|
|
3523
|
+
|
|
3524
|
+
return make_sequence(items)
|
|
3525
|
+
|
|
3526
|
+
def pack_characters(self, info):
|
|
3527
|
+
"Packs sequences of characters into strings."
|
|
3528
|
+
items = []
|
|
3529
|
+
characters = []
|
|
3530
|
+
case_flags = NOCASE
|
|
3531
|
+
for s in self.items:
|
|
3532
|
+
if type(s) is Character and s.positive and not s.zerowidth:
|
|
3533
|
+
if s.case_flags != case_flags:
|
|
3534
|
+
# Different case sensitivity, so flush, unless neither the
|
|
3535
|
+
# previous nor the new character are cased.
|
|
3536
|
+
if s.case_flags or is_cased_i(info, s.value):
|
|
3537
|
+
Sequence._flush_characters(info, characters,
|
|
3538
|
+
case_flags, items)
|
|
3539
|
+
|
|
3540
|
+
case_flags = s.case_flags
|
|
3541
|
+
|
|
3542
|
+
characters.append(s.value)
|
|
3543
|
+
elif type(s) is String or type(s) is Literal:
|
|
3544
|
+
if s.case_flags != case_flags:
|
|
3545
|
+
# Different case sensitivity, so flush, unless the neither
|
|
3546
|
+
# the previous nor the new string are cased.
|
|
3547
|
+
if s.case_flags or any(is_cased_i(info, c) for c in
|
|
3548
|
+
characters):
|
|
3549
|
+
Sequence._flush_characters(info, characters,
|
|
3550
|
+
case_flags, items)
|
|
3551
|
+
|
|
3552
|
+
case_flags = s.case_flags
|
|
3553
|
+
|
|
3554
|
+
characters.extend(s.characters)
|
|
3555
|
+
else:
|
|
3556
|
+
Sequence._flush_characters(info, characters, case_flags, items)
|
|
3557
|
+
|
|
3558
|
+
items.append(s.pack_characters(info))
|
|
3559
|
+
|
|
3560
|
+
Sequence._flush_characters(info, characters, case_flags, items)
|
|
3561
|
+
|
|
3562
|
+
return make_sequence(items)
|
|
3563
|
+
|
|
3564
|
+
def remove_captures(self):
|
|
3565
|
+
self.items = [s.remove_captures() for s in self.items]
|
|
3566
|
+
return self
|
|
3567
|
+
|
|
3568
|
+
def is_atomic(self):
|
|
3569
|
+
return all(s.is_atomic() for s in self.items)
|
|
3570
|
+
|
|
3571
|
+
def can_be_affix(self):
|
|
3572
|
+
return False
|
|
3573
|
+
|
|
3574
|
+
def contains_group(self):
|
|
3575
|
+
return any(s.contains_group() for s in self.items)
|
|
3576
|
+
|
|
3577
|
+
def get_firstset(self, reverse):
|
|
3578
|
+
fs = set()
|
|
3579
|
+
items = self.items
|
|
3580
|
+
if reverse:
|
|
3581
|
+
items.reverse()
|
|
3582
|
+
for s in items:
|
|
3583
|
+
fs |= s.get_firstset(reverse)
|
|
3584
|
+
if None not in fs:
|
|
3585
|
+
return fs
|
|
3586
|
+
fs.discard(None)
|
|
3587
|
+
|
|
3588
|
+
return fs | set([None])
|
|
3589
|
+
|
|
3590
|
+
def has_simple_start(self):
|
|
3591
|
+
return bool(self.items) and self.items[0].has_simple_start()
|
|
3592
|
+
|
|
3593
|
+
def _compile(self, reverse, fuzzy):
|
|
3594
|
+
seq = self.items
|
|
3595
|
+
if reverse:
|
|
3596
|
+
seq = seq[::-1]
|
|
3597
|
+
|
|
3598
|
+
code = []
|
|
3599
|
+
for s in seq:
|
|
3600
|
+
code.extend(s.compile(reverse, fuzzy))
|
|
3601
|
+
|
|
3602
|
+
return code
|
|
3603
|
+
|
|
3604
|
+
def dump(self, indent, reverse):
|
|
3605
|
+
for s in self.items:
|
|
3606
|
+
s.dump(indent, reverse)
|
|
3607
|
+
|
|
3608
|
+
@staticmethod
|
|
3609
|
+
def _flush_characters(info, characters, case_flags, items):
|
|
3610
|
+
if not characters:
|
|
3611
|
+
return
|
|
3612
|
+
|
|
3613
|
+
# Disregard case_flags if all of the characters are case-less.
|
|
3614
|
+
if case_flags & IGNORECASE:
|
|
3615
|
+
if not any(is_cased_i(info, c) for c in characters):
|
|
3616
|
+
case_flags = NOCASE
|
|
3617
|
+
|
|
3618
|
+
if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
|
|
3619
|
+
literals = Sequence._fix_full_casefold(characters)
|
|
3620
|
+
|
|
3621
|
+
for item in literals:
|
|
3622
|
+
chars = item.characters
|
|
3623
|
+
|
|
3624
|
+
if len(chars) == 1:
|
|
3625
|
+
items.append(Character(chars[0], case_flags=item.case_flags))
|
|
3626
|
+
else:
|
|
3627
|
+
items.append(String(chars, case_flags=item.case_flags))
|
|
3628
|
+
else:
|
|
3629
|
+
if len(characters) == 1:
|
|
3630
|
+
items.append(Character(characters[0], case_flags=case_flags))
|
|
3631
|
+
else:
|
|
3632
|
+
items.append(String(characters, case_flags=case_flags))
|
|
3633
|
+
|
|
3634
|
+
characters[:] = []
|
|
3635
|
+
|
|
3636
|
+
@staticmethod
|
|
3637
|
+
def _fix_full_casefold(characters):
|
|
3638
|
+
# Split a literal needing full case-folding into chunks that need it
|
|
3639
|
+
# and chunks that can use simple case-folding, which is faster.
|
|
3640
|
+
expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
|
|
3641
|
+
_regex.get_expand_on_folding()]
|
|
3642
|
+
string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
|
|
3643
|
+
for c in characters)).lower()
|
|
3644
|
+
chunks = []
|
|
3645
|
+
|
|
3646
|
+
for e in expanded:
|
|
3647
|
+
found = string.find(e)
|
|
3648
|
+
|
|
3649
|
+
while found >= 0:
|
|
3650
|
+
chunks.append((found, found + len(e)))
|
|
3651
|
+
found = string.find(e, found + 1)
|
|
3652
|
+
|
|
3653
|
+
pos = 0
|
|
3654
|
+
literals = []
|
|
3655
|
+
|
|
3656
|
+
for start, end in Sequence._merge_chunks(chunks):
|
|
3657
|
+
if pos < start:
|
|
3658
|
+
literals.append(Literal(characters[pos : start],
|
|
3659
|
+
case_flags=IGNORECASE))
|
|
3660
|
+
|
|
3661
|
+
literals.append(Literal(characters[start : end],
|
|
3662
|
+
case_flags=FULLIGNORECASE))
|
|
3663
|
+
pos = end
|
|
3664
|
+
|
|
3665
|
+
if pos < len(characters):
|
|
3666
|
+
literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))
|
|
3667
|
+
|
|
3668
|
+
return literals
|
|
3669
|
+
|
|
3670
|
+
@staticmethod
|
|
3671
|
+
def _merge_chunks(chunks):
|
|
3672
|
+
if len(chunks) < 2:
|
|
3673
|
+
return chunks
|
|
3674
|
+
|
|
3675
|
+
chunks.sort()
|
|
3676
|
+
|
|
3677
|
+
start, end = chunks[0]
|
|
3678
|
+
new_chunks = []
|
|
3679
|
+
|
|
3680
|
+
for s, e in chunks[1 : ]:
|
|
3681
|
+
if s <= end:
|
|
3682
|
+
end = max(end, e)
|
|
3683
|
+
else:
|
|
3684
|
+
new_chunks.append((start, end))
|
|
3685
|
+
start, end = s, e
|
|
3686
|
+
|
|
3687
|
+
new_chunks.append((start, end))
|
|
3688
|
+
|
|
3689
|
+
return new_chunks
|
|
3690
|
+
|
|
3691
|
+
def is_empty(self):
|
|
3692
|
+
return all(i.is_empty() for i in self.items)
|
|
3693
|
+
|
|
3694
|
+
def __eq__(self, other):
|
|
3695
|
+
return type(self) is type(other) and self.items == other.items
|
|
3696
|
+
|
|
3697
|
+
def max_width(self):
|
|
3698
|
+
return sum(s.max_width() for s in self.items)
|
|
3699
|
+
|
|
3700
|
+
def get_required_string(self, reverse):
|
|
3701
|
+
seq = self.items
|
|
3702
|
+
if reverse:
|
|
3703
|
+
seq = seq[::-1]
|
|
3704
|
+
|
|
3705
|
+
offset = 0
|
|
3706
|
+
|
|
3707
|
+
for s in seq:
|
|
3708
|
+
ofs, req = s.get_required_string(reverse)
|
|
3709
|
+
offset += ofs
|
|
3710
|
+
if req:
|
|
3711
|
+
return offset, req
|
|
3712
|
+
|
|
3713
|
+
return offset, None
|
|
3714
|
+
|
|
3715
|
+
class SetBase(RegexBase):
|
|
3716
|
+
def __init__(self, info, items, positive=True, case_flags=NOCASE,
|
|
3717
|
+
zerowidth=False):
|
|
3718
|
+
RegexBase.__init__(self)
|
|
3719
|
+
self.info = info
|
|
3720
|
+
self.items = tuple(items)
|
|
3721
|
+
self.positive = bool(positive)
|
|
3722
|
+
self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
|
|
3723
|
+
self.zerowidth = bool(zerowidth)
|
|
3724
|
+
|
|
3725
|
+
self.char_width = 1
|
|
3726
|
+
|
|
3727
|
+
self._key = (self.__class__, self.items, self.positive,
|
|
3728
|
+
self.case_flags, self.zerowidth)
|
|
3729
|
+
|
|
3730
|
+
def rebuild(self, positive, case_flags, zerowidth):
|
|
3731
|
+
return type(self)(self.info, self.items, positive, case_flags,
|
|
3732
|
+
zerowidth).optimise(self.info, False)
|
|
3733
|
+
|
|
3734
|
+
def get_firstset(self, reverse):
|
|
3735
|
+
return set([self])
|
|
3736
|
+
|
|
3737
|
+
def has_simple_start(self):
|
|
3738
|
+
return True
|
|
3739
|
+
|
|
3740
|
+
def _compile(self, reverse, fuzzy):
|
|
3741
|
+
flags = 0
|
|
3742
|
+
if self.positive:
|
|
3743
|
+
flags |= POSITIVE_OP
|
|
3744
|
+
if self.zerowidth:
|
|
3745
|
+
flags |= ZEROWIDTH_OP
|
|
3746
|
+
if fuzzy:
|
|
3747
|
+
flags |= FUZZY_OP
|
|
3748
|
+
code = [(self._opcode[self.case_flags, reverse], flags)]
|
|
3749
|
+
for m in self.items:
|
|
3750
|
+
code.extend(m.compile())
|
|
3751
|
+
|
|
3752
|
+
code.append((OP.END, ))
|
|
3753
|
+
|
|
3754
|
+
return code
|
|
3755
|
+
|
|
3756
|
+
def dump(self, indent, reverse):
|
|
3757
|
+
print("{}{} {}{}".format(INDENT * indent, self._op_name,
|
|
3758
|
+
POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
|
|
3759
|
+
for i in self.items:
|
|
3760
|
+
i.dump(indent + 1, reverse)
|
|
3761
|
+
|
|
3762
|
+
def _handle_case_folding(self, info, in_set):
|
|
3763
|
+
# Is the set case-sensitive?
|
|
3764
|
+
if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
|
|
3765
|
+
return self
|
|
3766
|
+
|
|
3767
|
+
# Is full case-folding possible?
|
|
3768
|
+
if (not (self.info.flags & UNICODE) or (self.case_flags &
|
|
3769
|
+
FULLIGNORECASE) != FULLIGNORECASE):
|
|
3770
|
+
return self
|
|
3771
|
+
|
|
3772
|
+
# Get the characters which expand to multiple codepoints on folding.
|
|
3773
|
+
expanding_chars = _regex.get_expand_on_folding()
|
|
3774
|
+
|
|
3775
|
+
# Get the folded characters in the set.
|
|
3776
|
+
items = []
|
|
3777
|
+
seen = set()
|
|
3778
|
+
for ch in expanding_chars:
|
|
3779
|
+
if self.matches(ord(ch)):
|
|
3780
|
+
folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
|
|
3781
|
+
if folded not in seen:
|
|
3782
|
+
items.append(String([ord(c) for c in folded],
|
|
3783
|
+
case_flags=self.case_flags))
|
|
3784
|
+
seen.add(folded)
|
|
3785
|
+
|
|
3786
|
+
if not items:
|
|
3787
|
+
# We can fall back to simple case-folding.
|
|
3788
|
+
return self
|
|
3789
|
+
|
|
3790
|
+
return Branch([self] + items)
|
|
3791
|
+
|
|
3792
|
+
def max_width(self):
|
|
3793
|
+
# Is the set case-sensitive?
|
|
3794
|
+
if not self.positive or not (self.case_flags & IGNORECASE):
|
|
3795
|
+
return 1
|
|
3796
|
+
|
|
3797
|
+
# Is full case-folding possible?
|
|
3798
|
+
if (not (self.info.flags & UNICODE) or (self.case_flags &
|
|
3799
|
+
FULLIGNORECASE) != FULLIGNORECASE):
|
|
3800
|
+
return 1
|
|
3801
|
+
|
|
3802
|
+
# Get the characters which expand to multiple codepoints on folding.
|
|
3803
|
+
expanding_chars = _regex.get_expand_on_folding()
|
|
3804
|
+
|
|
3805
|
+
# Get the folded characters in the set.
|
|
3806
|
+
seen = set()
|
|
3807
|
+
for ch in expanding_chars:
|
|
3808
|
+
if self.matches(ord(ch)):
|
|
3809
|
+
folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
|
|
3810
|
+
seen.add(folded)
|
|
3811
|
+
|
|
3812
|
+
if not seen:
|
|
3813
|
+
return 1
|
|
3814
|
+
|
|
3815
|
+
return max(len(folded) for folded in seen)
|
|
3816
|
+
|
|
3817
|
+
def __del__(self):
|
|
3818
|
+
self.info = None
|
|
3819
|
+
|
|
3820
|
+
class SetDiff(SetBase):
|
|
3821
|
+
_opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
|
|
3822
|
+
OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
|
|
3823
|
+
OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
|
|
3824
|
+
OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
|
|
3825
|
+
True): OP.SET_DIFF_IGN_REV}
|
|
3826
|
+
_op_name = "SET_DIFF"
|
|
3827
|
+
|
|
3828
|
+
def optimise(self, info, reverse, in_set=False):
|
|
3829
|
+
items = self.items
|
|
3830
|
+
if len(items) > 2:
|
|
3831
|
+
items = [items[0], SetUnion(info, items[1 : ])]
|
|
3832
|
+
|
|
3833
|
+
if len(items) == 1:
|
|
3834
|
+
return items[0].with_flags(case_flags=self.case_flags,
|
|
3835
|
+
zerowidth=self.zerowidth).optimise(info, reverse, in_set)
|
|
3836
|
+
|
|
3837
|
+
self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
|
|
3838
|
+
items)
|
|
3839
|
+
|
|
3840
|
+
return self._handle_case_folding(info, in_set)
|
|
3841
|
+
|
|
3842
|
+
def matches(self, ch):
|
|
3843
|
+
m = self.items[0].matches(ch) and not self.items[1].matches(ch)
|
|
3844
|
+
return m == self.positive
|
|
3845
|
+
|
|
3846
|
+
class SetInter(SetBase):
|
|
3847
|
+
_opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
|
|
3848
|
+
OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
|
|
3849
|
+
False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
|
|
3850
|
+
True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
|
|
3851
|
+
(FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
|
|
3852
|
+
_op_name = "SET_INTER"
|
|
3853
|
+
|
|
3854
|
+
def optimise(self, info, reverse, in_set=False):
|
|
3855
|
+
items = []
|
|
3856
|
+
for m in self.items:
|
|
3857
|
+
m = m.optimise(info, reverse, in_set=True)
|
|
3858
|
+
if isinstance(m, SetInter) and m.positive:
|
|
3859
|
+
# Intersection in intersection.
|
|
3860
|
+
items.extend(m.items)
|
|
3861
|
+
else:
|
|
3862
|
+
items.append(m)
|
|
3863
|
+
|
|
3864
|
+
if len(items) == 1:
|
|
3865
|
+
return items[0].with_flags(case_flags=self.case_flags,
|
|
3866
|
+
zerowidth=self.zerowidth).optimise(info, reverse, in_set)
|
|
3867
|
+
|
|
3868
|
+
self.items = tuple(items)
|
|
3869
|
+
|
|
3870
|
+
return self._handle_case_folding(info, in_set)
|
|
3871
|
+
|
|
3872
|
+
def matches(self, ch):
|
|
3873
|
+
m = all(i.matches(ch) for i in self.items)
|
|
3874
|
+
return m == self.positive
|
|
3875
|
+
|
|
3876
|
+
class SetSymDiff(SetBase):
|
|
3877
|
+
_opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
|
|
3878
|
+
OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
|
|
3879
|
+
False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
|
|
3880
|
+
(IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
|
|
3881
|
+
OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
|
|
3882
|
+
_op_name = "SET_SYM_DIFF"
|
|
3883
|
+
|
|
3884
|
+
def optimise(self, info, reverse, in_set=False):
|
|
3885
|
+
items = []
|
|
3886
|
+
for m in self.items:
|
|
3887
|
+
m = m.optimise(info, reverse, in_set=True)
|
|
3888
|
+
if isinstance(m, SetSymDiff) and m.positive:
|
|
3889
|
+
# Symmetric difference in symmetric difference.
|
|
3890
|
+
items.extend(m.items)
|
|
3891
|
+
else:
|
|
3892
|
+
items.append(m)
|
|
3893
|
+
|
|
3894
|
+
if len(items) == 1:
|
|
3895
|
+
return items[0].with_flags(case_flags=self.case_flags,
|
|
3896
|
+
zerowidth=self.zerowidth).optimise(info, reverse, in_set)
|
|
3897
|
+
|
|
3898
|
+
self.items = tuple(items)
|
|
3899
|
+
|
|
3900
|
+
return self._handle_case_folding(info, in_set)
|
|
3901
|
+
|
|
3902
|
+
def matches(self, ch):
|
|
3903
|
+
m = False
|
|
3904
|
+
for i in self.items:
|
|
3905
|
+
m = m != i.matches(ch)
|
|
3906
|
+
|
|
3907
|
+
return m == self.positive
|
|
3908
|
+
|
|
3909
|
+
class SetUnion(SetBase):
|
|
3910
|
+
_opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
|
|
3911
|
+
OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
|
|
3912
|
+
False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
|
|
3913
|
+
True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
|
|
3914
|
+
(FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
|
|
3915
|
+
_op_name = "SET_UNION"
|
|
3916
|
+
|
|
3917
|
+
def optimise(self, info, reverse, in_set=False):
|
|
3918
|
+
items = []
|
|
3919
|
+
for m in self.items:
|
|
3920
|
+
m = m.optimise(info, reverse, in_set=True)
|
|
3921
|
+
if isinstance(m, SetUnion) and m.positive:
|
|
3922
|
+
# Union in union.
|
|
3923
|
+
items.extend(m.items)
|
|
3924
|
+
elif isinstance(m, AnyAll):
|
|
3925
|
+
return AnyAll()
|
|
3926
|
+
else:
|
|
3927
|
+
items.append(m)
|
|
3928
|
+
|
|
3929
|
+
# Are there complementary properties?
|
|
3930
|
+
properties = (set(), set())
|
|
3931
|
+
|
|
3932
|
+
for m in items:
|
|
3933
|
+
if isinstance(m, Property):
|
|
3934
|
+
properties[m.positive].add((m.value, m.case_flags, m.zerowidth))
|
|
3935
|
+
|
|
3936
|
+
if properties[0] & properties[1]:
|
|
3937
|
+
return AnyAll()
|
|
3938
|
+
|
|
3939
|
+
if len(items) == 1:
|
|
3940
|
+
i = items[0]
|
|
3941
|
+
return i.with_flags(positive=i.positive == self.positive,
|
|
3942
|
+
case_flags=self.case_flags,
|
|
3943
|
+
zerowidth=self.zerowidth).optimise(info, reverse, in_set)
|
|
3944
|
+
|
|
3945
|
+
self.items = tuple(items)
|
|
3946
|
+
|
|
3947
|
+
return self._handle_case_folding(info, in_set)
|
|
3948
|
+
|
|
3949
|
+
def _compile(self, reverse, fuzzy):
|
|
3950
|
+
flags = 0
|
|
3951
|
+
if self.positive:
|
|
3952
|
+
flags |= POSITIVE_OP
|
|
3953
|
+
if self.zerowidth:
|
|
3954
|
+
flags |= ZEROWIDTH_OP
|
|
3955
|
+
if fuzzy:
|
|
3956
|
+
flags |= FUZZY_OP
|
|
3957
|
+
|
|
3958
|
+
characters, others = defaultdict(list), []
|
|
3959
|
+
for m in self.items:
|
|
3960
|
+
if isinstance(m, Character):
|
|
3961
|
+
characters[m.positive].append(m.value)
|
|
3962
|
+
else:
|
|
3963
|
+
others.append(m)
|
|
3964
|
+
|
|
3965
|
+
code = [(self._opcode[self.case_flags, reverse], flags)]
|
|
3966
|
+
|
|
3967
|
+
for positive, values in characters.items():
|
|
3968
|
+
flags = 0
|
|
3969
|
+
if positive:
|
|
3970
|
+
flags |= POSITIVE_OP
|
|
3971
|
+
if len(values) == 1:
|
|
3972
|
+
code.append((OP.CHARACTER, flags, values[0]))
|
|
3973
|
+
else:
|
|
3974
|
+
code.append((OP.STRING, flags, len(values)) + tuple(values))
|
|
3975
|
+
|
|
3976
|
+
for m in others:
|
|
3977
|
+
code.extend(m.compile())
|
|
3978
|
+
|
|
3979
|
+
code.append((OP.END, ))
|
|
3980
|
+
|
|
3981
|
+
return code
|
|
3982
|
+
|
|
3983
|
+
def matches(self, ch):
|
|
3984
|
+
m = any(i.matches(ch) for i in self.items)
|
|
3985
|
+
return m == self.positive
|
|
3986
|
+
|
|
3987
|
+
class Skip(ZeroWidthBase):
|
|
3988
|
+
_op_name = "SKIP"
|
|
3989
|
+
_opcode = OP.SKIP
|
|
3990
|
+
|
|
3991
|
+
class StartOfLine(ZeroWidthBase):
|
|
3992
|
+
_opcode = OP.START_OF_LINE
|
|
3993
|
+
_op_name = "START_OF_LINE"
|
|
3994
|
+
|
|
3995
|
+
class StartOfLineU(StartOfLine):
|
|
3996
|
+
_opcode = OP.START_OF_LINE_U
|
|
3997
|
+
_op_name = "START_OF_LINE_U"
|
|
3998
|
+
|
|
3999
|
+
class StartOfString(ZeroWidthBase):
|
|
4000
|
+
_opcode = OP.START_OF_STRING
|
|
4001
|
+
_op_name = "START_OF_STRING"
|
|
4002
|
+
|
|
4003
|
+
class StartOfWord(ZeroWidthBase):
|
|
4004
|
+
_opcode = OP.START_OF_WORD
|
|
4005
|
+
_op_name = "START_OF_WORD"
|
|
4006
|
+
|
|
4007
|
+
class String(RegexBase):
|
|
4008
|
+
_opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
|
|
4009
|
+
(FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
|
|
4010
|
+
(NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
|
|
4011
|
+
(FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
|
|
4012
|
+
OP.STRING_FLD_REV}
|
|
4013
|
+
|
|
4014
|
+
def __init__(self, characters, case_flags=NOCASE):
|
|
4015
|
+
self.characters = tuple(characters)
|
|
4016
|
+
self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
|
|
4017
|
+
|
|
4018
|
+
if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
|
|
4019
|
+
folded_characters = []
|
|
4020
|
+
for char in self.characters:
|
|
4021
|
+
folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
|
|
4022
|
+
folded_characters.extend(ord(c) for c in folded)
|
|
4023
|
+
else:
|
|
4024
|
+
folded_characters = self.characters
|
|
4025
|
+
|
|
4026
|
+
self.folded_characters = tuple(folded_characters)
|
|
4027
|
+
self.required = False
|
|
4028
|
+
|
|
4029
|
+
self._key = self.__class__, self.characters, self.case_flags
|
|
4030
|
+
|
|
4031
|
+
def get_firstset(self, reverse):
|
|
4032
|
+
if reverse:
|
|
4033
|
+
pos = -1
|
|
4034
|
+
else:
|
|
4035
|
+
pos = 0
|
|
4036
|
+
return set([Character(self.characters[pos],
|
|
4037
|
+
case_flags=self.case_flags)])
|
|
4038
|
+
|
|
4039
|
+
def has_simple_start(self):
|
|
4040
|
+
return True
|
|
4041
|
+
|
|
4042
|
+
def _compile(self, reverse, fuzzy):
|
|
4043
|
+
flags = 0
|
|
4044
|
+
if fuzzy:
|
|
4045
|
+
flags |= FUZZY_OP
|
|
4046
|
+
if self.required:
|
|
4047
|
+
flags |= REQUIRED_OP
|
|
4048
|
+
return [(self._opcode[self.case_flags, reverse], flags,
|
|
4049
|
+
len(self.folded_characters)) + self.folded_characters]
|
|
4050
|
+
|
|
4051
|
+
def dump(self, indent, reverse):
|
|
4052
|
+
display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
|
|
4053
|
+
print("{}STRING {}{}".format(INDENT * indent, display,
|
|
4054
|
+
CASE_TEXT[self.case_flags]))
|
|
4055
|
+
|
|
4056
|
+
def max_width(self):
|
|
4057
|
+
return len(self.folded_characters)
|
|
4058
|
+
|
|
4059
|
+
def get_required_string(self, reverse):
|
|
4060
|
+
return 0, self
|
|
4061
|
+
|
|
4062
|
+
class Literal(String):
|
|
4063
|
+
def dump(self, indent, reverse):
|
|
4064
|
+
literal = ''.join(chr(c) for c in self.characters)
|
|
4065
|
+
display = ascii(literal).lstrip("bu")
|
|
4066
|
+
print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
|
|
4067
|
+
CASE_TEXT[self.case_flags]))
|
|
4068
|
+
|
|
4069
|
+
class StringSet(Branch):
|
|
4070
|
+
def __init__(self, info, name, case_flags=NOCASE):
|
|
4071
|
+
self.info = info
|
|
4072
|
+
self.name = name
|
|
4073
|
+
self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
|
|
4074
|
+
|
|
4075
|
+
self._key = self.__class__, self.name, self.case_flags
|
|
4076
|
+
|
|
4077
|
+
self.set_key = (name, self.case_flags)
|
|
4078
|
+
if self.set_key not in info.named_lists_used:
|
|
4079
|
+
info.named_lists_used[self.set_key] = len(info.named_lists_used)
|
|
4080
|
+
|
|
4081
|
+
index = self.info.named_lists_used[self.set_key]
|
|
4082
|
+
items = self.info.kwargs[self.name]
|
|
4083
|
+
|
|
4084
|
+
case_flags = self.case_flags
|
|
4085
|
+
|
|
4086
|
+
encoding = self.info.flags & _ALL_ENCODINGS
|
|
4087
|
+
fold_flags = encoding | case_flags
|
|
4088
|
+
|
|
4089
|
+
choices = []
|
|
4090
|
+
|
|
4091
|
+
for string in items:
|
|
4092
|
+
if isinstance(string, str):
|
|
4093
|
+
string = [ord(c) for c in string]
|
|
4094
|
+
|
|
4095
|
+
choices.append([Character(c, case_flags=case_flags) for c in
|
|
4096
|
+
string])
|
|
4097
|
+
|
|
4098
|
+
# Sort from longest to shortest.
|
|
4099
|
+
choices.sort(key=len, reverse=True)
|
|
4100
|
+
|
|
4101
|
+
self.branches = [Sequence(choice) for choice in choices]
|
|
4102
|
+
|
|
4103
|
+
def dump(self, indent, reverse):
|
|
4104
|
+
print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
|
|
4105
|
+
CASE_TEXT[self.case_flags]))
|
|
4106
|
+
|
|
4107
|
+
def __del__(self):
|
|
4108
|
+
self.info = None
|
|
4109
|
+
|
|
4110
|
+
class Source:
|
|
4111
|
+
"Scanner for the regular expression source string."
|
|
4112
|
+
def __init__(self, string):
|
|
4113
|
+
if isinstance(string, str):
|
|
4114
|
+
self.string = string
|
|
4115
|
+
self.char_type = chr
|
|
4116
|
+
else:
|
|
4117
|
+
self.string = string.decode("latin-1")
|
|
4118
|
+
self.char_type = lambda c: bytes([c])
|
|
4119
|
+
|
|
4120
|
+
self.pos = 0
|
|
4121
|
+
self.ignore_space = False
|
|
4122
|
+
self.sep = string[ : 0]
|
|
4123
|
+
|
|
4124
|
+
def peek(self, override_ignore=False):
|
|
4125
|
+
string = self.string
|
|
4126
|
+
pos = self.pos
|
|
4127
|
+
|
|
4128
|
+
try:
|
|
4129
|
+
if self.ignore_space and not override_ignore:
|
|
4130
|
+
while True:
|
|
4131
|
+
if string[pos].isspace():
|
|
4132
|
+
# Skip over the whitespace.
|
|
4133
|
+
pos += 1
|
|
4134
|
+
elif string[pos] == "#":
|
|
4135
|
+
# Skip over the comment to the end of the line.
|
|
4136
|
+
pos = string.index("\n", pos)
|
|
4137
|
+
else:
|
|
4138
|
+
break
|
|
4139
|
+
|
|
4140
|
+
return string[pos]
|
|
4141
|
+
except IndexError:
|
|
4142
|
+
# We've reached the end of the string.
|
|
4143
|
+
return string[ : 0]
|
|
4144
|
+
except ValueError:
|
|
4145
|
+
# The comment extended to the end of the string.
|
|
4146
|
+
return string[ : 0]
|
|
4147
|
+
|
|
4148
|
+
def get(self, override_ignore=False):
|
|
4149
|
+
string = self.string
|
|
4150
|
+
pos = self.pos
|
|
4151
|
+
|
|
4152
|
+
try:
|
|
4153
|
+
if self.ignore_space and not override_ignore:
|
|
4154
|
+
while True:
|
|
4155
|
+
if string[pos].isspace():
|
|
4156
|
+
# Skip over the whitespace.
|
|
4157
|
+
pos += 1
|
|
4158
|
+
elif string[pos] == "#":
|
|
4159
|
+
# Skip over the comment to the end of the line.
|
|
4160
|
+
pos = string.index("\n", pos)
|
|
4161
|
+
else:
|
|
4162
|
+
break
|
|
4163
|
+
|
|
4164
|
+
ch = string[pos]
|
|
4165
|
+
self.pos = pos + 1
|
|
4166
|
+
return ch
|
|
4167
|
+
except IndexError:
|
|
4168
|
+
# We've reached the end of the string.
|
|
4169
|
+
self.pos = pos
|
|
4170
|
+
return string[ : 0]
|
|
4171
|
+
except ValueError:
|
|
4172
|
+
# The comment extended to the end of the string.
|
|
4173
|
+
self.pos = len(string)
|
|
4174
|
+
return string[ : 0]
|
|
4175
|
+
|
|
4176
|
+
def get_many(self, count=1):
|
|
4177
|
+
string = self.string
|
|
4178
|
+
pos = self.pos
|
|
4179
|
+
|
|
4180
|
+
try:
|
|
4181
|
+
if self.ignore_space:
|
|
4182
|
+
substring = []
|
|
4183
|
+
|
|
4184
|
+
while len(substring) < count:
|
|
4185
|
+
while True:
|
|
4186
|
+
if string[pos].isspace():
|
|
4187
|
+
# Skip over the whitespace.
|
|
4188
|
+
pos += 1
|
|
4189
|
+
elif string[pos] == "#":
|
|
4190
|
+
# Skip over the comment to the end of the line.
|
|
4191
|
+
pos = string.index("\n", pos)
|
|
4192
|
+
else:
|
|
4193
|
+
break
|
|
4194
|
+
|
|
4195
|
+
substring.append(string[pos])
|
|
4196
|
+
pos += 1
|
|
4197
|
+
|
|
4198
|
+
substring = "".join(substring)
|
|
4199
|
+
else:
|
|
4200
|
+
substring = string[pos : pos + count]
|
|
4201
|
+
pos += len(substring)
|
|
4202
|
+
|
|
4203
|
+
self.pos = pos
|
|
4204
|
+
return substring
|
|
4205
|
+
except IndexError:
|
|
4206
|
+
# We've reached the end of the string.
|
|
4207
|
+
self.pos = len(string)
|
|
4208
|
+
return "".join(substring)
|
|
4209
|
+
except ValueError:
|
|
4210
|
+
# The comment extended to the end of the string.
|
|
4211
|
+
self.pos = len(string)
|
|
4212
|
+
return "".join(substring)
|
|
4213
|
+
|
|
4214
|
+
def get_while(self, test_set, include=True, keep_spaces=False):
    "Collects characters while their membership in test_set equals 'include'."
    text = self.string
    index = self.pos

    if not (self.ignore_space and not keep_spaces):
        # Plain scan: advance while the membership test keeps passing.
        try:
            while (text[index] in test_set) == include:
                index += 1
        except IndexError:
            # We've run off the end of the string.
            pass

        taken = text[self.pos : index]
        self.pos = index
        return taken

    # Verbose mode: whitespace and '#' comments are insignificant and are
    # dropped rather than collected.
    taken = []
    try:
        while True:
            ch = text[index]
            if ch.isspace():
                # Skip over the whitespace.
                index += 1
            elif ch == "#":
                # Skip over the comment to the end of the line.
                index = text.index("\n", index)
            elif (ch in test_set) == include:
                taken.append(ch)
                index += 1
            else:
                break

        self.pos = index
    except (IndexError, ValueError):
        # Ran off the end, or a comment reached the end of the string.
        self.pos = len(text)

    return "".join(taken)
|
|
4261
|
+
|
|
4262
|
+
def skip_while(self, test_set, include=True):
    "Advances past characters whose membership in test_set equals 'include'."
    text = self.string
    index = self.pos

    try:
        if not self.ignore_space:
            while (text[index] in test_set) == include:
                index += 1
        else:
            # Verbose mode: whitespace and '#' comments are also skipped.
            while True:
                ch = text[index]
                if ch.isspace():
                    # Skip over the whitespace.
                    index += 1
                elif ch == "#":
                    # Skip over the comment to the end of the line.
                    index = text.index("\n", index)
                elif (ch in test_set) == include:
                    index += 1
                else:
                    break

        self.pos = index
    except (IndexError, ValueError):
        # Ran off the end, or a comment reached the end of the string.
        self.pos = len(text)
|
|
4290
|
+
|
|
4291
|
+
def match(self, substring):
    "Tries to consume 'substring' at the current position; returns success."
    text = self.string
    index = self.pos

    if not self.ignore_space:
        # Simple case: a direct prefix check at the current position.
        if text.startswith(substring, index):
            self.pos = index + len(substring)
            return True

        return False

    # Verbose mode: whitespace and '#' comments may separate the characters
    # of the substring.
    try:
        for wanted in substring:
            while True:
                ch = text[index]
                if ch.isspace():
                    # Skip over the whitespace.
                    index += 1
                elif ch == "#":
                    # Skip over the comment to the end of the line.
                    index = text.index("\n", index)
                else:
                    break

            if text[index] != wanted:
                return False

            index += 1
    except (IndexError, ValueError):
        # Ran off the end, or a comment reached the end of the string.
        return False

    self.pos = index
    return True
|
|
4329
|
+
|
|
4330
|
+
def expect(self, substring):
    "Consumes 'substring', raising an error if it isn't present."
    if self.match(substring):
        return

    raise error("missing {}".format(substring), self.string, self.pos)
|
|
4333
|
+
|
|
4334
|
+
def at_end(self):
    "Returns whether only ignorable text (if any) remains in the pattern."
    text = self.string
    index = self.pos

    try:
        if self.ignore_space:
            # Skip any trailing whitespace and '#' comments first; note that
            # self.pos is deliberately left unchanged.
            while True:
                ch = text[index]
                if ch.isspace():
                    index += 1
                elif ch == "#":
                    index = text.index("\n", index)
                else:
                    break

        return index >= len(text)
    except (IndexError, ValueError):
        # Ran off the end, or a comment reached the end of the string.
        return True
|
|
4355
|
+
|
|
4356
|
+
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        """Initialises pattern-wide parse state.

        flags: the regex flags in effect; the defaults for the selected regex
            version are OR'd in.
        char_type: the type used to build characters of the pattern.
        kwargs: keyword arguments (named lists) supplied by the caller.
        """
        # Fold in the default flags for the selected regex version.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        # Use a fresh dict per instance; the previous mutable default
        # argument ({}) would have been shared between all Info objects
        # created without explicit kwargs.
        self.kwargs = {} if kwargs is None else kwargs

        self.group_count = 0    # Highest group number allocated so far.
        self.group_index = {}   # Group name -> group number.
        self.group_name = {}    # Group number -> group name.
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []   # Stack of currently open groups.
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Opens a (possibly named) capture group and returns its number.

        Re-opening a group that is already open yields a negative private
        alias which is resolved to a proper group number later.
        """
        group = self.group_index.get(name)
        if group is None:
            # Allocate the next group number not already taken by a name.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Closes the most recently opened group."
        self.open_groups.pop()

    def is_open_group(self, name):
        """Returns whether the group (by name or digit string) is open.

        In version 1, a group reference can refer to an open group, so we'll
        just pretend the group isn't open.
        """
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
|
|
4420
|
+
|
|
4421
|
+
def _check_group_features(info, parsed):
|
|
4422
|
+
"""Checks whether the reverse and fuzzy features of the group calls match
|
|
4423
|
+
the groups which they call.
|
|
4424
|
+
"""
|
|
4425
|
+
call_refs = {}
|
|
4426
|
+
additional_groups = []
|
|
4427
|
+
for call, reverse, fuzzy in info.group_calls:
|
|
4428
|
+
# Look up the reference of this group call.
|
|
4429
|
+
key = (call.group, reverse, fuzzy)
|
|
4430
|
+
ref = call_refs.get(key)
|
|
4431
|
+
if ref is None:
|
|
4432
|
+
# This group doesn't have a reference yet, so look up its features.
|
|
4433
|
+
if call.group == 0:
|
|
4434
|
+
# Calling the pattern as a whole.
|
|
4435
|
+
rev = bool(info.flags & REVERSE)
|
|
4436
|
+
fuz = isinstance(parsed, Fuzzy)
|
|
4437
|
+
if (rev, fuz) != (reverse, fuzzy):
|
|
4438
|
+
# The pattern as a whole doesn't have the features we want,
|
|
4439
|
+
# so we'll need to make a copy of it with the desired
|
|
4440
|
+
# features.
|
|
4441
|
+
additional_groups.append((CallRef(len(call_refs), parsed),
|
|
4442
|
+
reverse, fuzzy))
|
|
4443
|
+
else:
|
|
4444
|
+
# Calling a capture group.
|
|
4445
|
+
def_info = info.defined_groups[call.group]
|
|
4446
|
+
group = def_info[0]
|
|
4447
|
+
if def_info[1 : ] != (reverse, fuzzy):
|
|
4448
|
+
# The group doesn't have the features we want, so we'll
|
|
4449
|
+
# need to make a copy of it with the desired features.
|
|
4450
|
+
additional_groups.append((group, reverse, fuzzy))
|
|
4451
|
+
|
|
4452
|
+
ref = len(call_refs)
|
|
4453
|
+
call_refs[key] = ref
|
|
4454
|
+
|
|
4455
|
+
call.call_ref = ref
|
|
4456
|
+
|
|
4457
|
+
info.call_refs = call_refs
|
|
4458
|
+
info.additional_groups = additional_groups
|
|
4459
|
+
|
|
4460
|
+
def _get_required_string(parsed, flags):
    "Gets the required string and related info of a parsed pattern."

    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        # No required string could be determined.
        return 0, (), 0

    required.required = True
    if req_offset >= UNLIMITED:
        # The offset is unbounded; -1 marks it as unknown.
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        # Drop Unicode case semantics when the pattern isn't Unicode.
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
|
|
4480
|
+
|
|
4481
|
+
class Scanner:
    """A lexical scanner driven by a lexicon of (phrase, action) pairs.

    Each phrase is compiled into one branch of a single compound pattern;
    when a branch matches, the corresponding action is applied.
    """

    def __init__(self, lexicon, flags=0):
        # lexicon: a list of (pattern_text, action) pairs; an action may be a
        # callable (invoked as action(scanner, matched_text)) or a plain
        # value appended to the results as-is.
        # flags: regex flags applied when parsing every phrase.
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern. Each phrase becomes
        # group g+1, so match.lastindex identifies the matching phrase.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            # NOTE(review): 'source' here is whatever the loop above last
            # bound, i.e. the final phrase's source — confirm intended.
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        # Repeatedly matches the compound pattern at the current position,
        # applying the action of whichever phrase matched. Returns a pair of
        # (collected results, unscanned remainder of the string).
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would loop forever; stop scanning.
                break
            # lastindex is the group number of the matching phrase.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                # Expose the match object to the callable via self.match.
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
|
|
4576
|
+
|
|
4577
|
+
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: maps each property id to a pair
# (longest_property_alias, {value_id: longest_value_alias}). The longest
# alias is preferred as the most descriptive name.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        # Likewise keep the longest alias seen for each property value.
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)
|
|
4590
|
+
|
|
4591
|
+
# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Module-internal markers selecting ASCII vs Unicode semantics for the
# predefined character sets and word boundaries below.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2
|
|
4604
|
+
|
|
4605
|
+
# Predefined character set escape sequences (\d, \s, \w and friends), using
# the default encoding semantics.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# Variants forced to ASCII semantics; "h" keeps the default lookup.
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
})

# Variants forced to Unicode semantics; "h" keeps the default lookup.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
})
|
|
4634
|
+
|
|
4635
|
+
# Positional (zero-width) escape sequences, using the default encoding
# semantics.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}

# Variants with ASCII word semantics for the boundary-related escapes.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES.update({
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
})

# Variants with Unicode word semantics for the boundary-related escapes.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES.update({
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
})

# Positional escape sequences when WORD flag set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})
|
|
4668
|
+
|
|
4669
|
+
# Regex control verbs, written as (*VERB) in a pattern. "F" is shorthand for
# "FAIL".
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}
|