regex 2026.1.14__cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
regex/_regex_core.py ADDED
@@ -0,0 +1,4675 @@
1
+ #
2
+ # Secret Labs' Regular Expression Engine core module
3
+ #
4
+ # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5
+ #
6
+ # This version of the SRE library can be redistributed under CNRI's
7
+ # Python 1.6 license. For any other use, please contact Secret Labs
8
+ # AB (info@pythonware.com).
9
+ #
10
+ # Portions of this engine have been developed in cooperation with
11
+ # CNRI. Hewlett-Packard provided funding for 1.6 integration and
12
+ # other compatibility work.
13
+ #
14
+ # 2010-01-16 mrab Python front-end re-written and extended
15
+
16
+ import enum
17
+ import string
18
+ import unicodedata
19
+ from collections import defaultdict
20
+
21
+ from regex import _regex
22
+
23
# `__all__` controls what `from <module> import *` exposes: the flag
# constants, the `error` exception, `Scanner` and the `RegexFlag` enum.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
28
+
29
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        self.msg = message
        self.pattern = pattern
        self.pos = pos

        # The pattern may be a str or a bytes-like object; pick the matching
        # newline so the count/rfind calls below work on either.
        newline = '\n' if isinstance(pattern, str) else b'\n'

        if pattern is not None and pos is not None:
            # Work out where in the pattern the failure happened.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Only mention line/column for multi-line patterns.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)
58
+
59
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    # Internal control-flow exception; never shown to users.
    pass
63
+
64
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    # Internal control-flow exception; callers catch it and backtrack.
    pass
67
+
68
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    # Internal control-flow exception raised during firstset construction.
    pass
71
+
72
# Flags.
class RegexFlag(enum.IntFlag):
    """The compilation flags, combinable with `|`.

    Each flag has a one-letter alias (eg. `I` for `IGNORECASE`).
    """
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A canonical (named) flag value prints as its bare name.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        # For a negative (inverted) value, decompose the complement instead.
        if negative:
            value = ~value

        # Strip out each named flag that is present.
        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        # Any residue that matches no named flag is shown in hex.
        if value:
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # Use the plain object str so str(flag) doesn't mirror __repr__.
    __str__ = object.__str__
123
+
124
# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

# The behaviour used when neither V0 nor V1 is given.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the pattern parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that hitting the end of the pattern is treated as
# "special" too.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE without IGNORECASE normalises to no case flags at all.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
210
+
211
# The names of the opcodes. The position of each name in this string
# determines its numeric opcode value below; do not reorder.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""

# Define the opcodes in a namespace.
class Namespace:
    pass

# OP.<NAME> is the integer opcode, numbered by position in OPCODES
# (OP.FAILURE == 0, OP.SUCCESS == 1, ...).
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
303
+
304
def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
    """Make room in the given cache.

    Args:
        cache_dict: The cache dictionary to modify.
        args_dict: The dictionary of named list args used by patterns;
            rebuilt to match the entries that survive the shrink.
        locale_sensitive: The dictionary of locale-sensitivity entries keyed
            by (pattern_type, pattern); also rebuilt to match the survivors.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Toss out a fraction of the entries at random to make room for new ones.
    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
    # as popitem could penalize the same regular expression repeatedly based
    # on its internal hash value. Being random should spread the cache miss
    # love around.
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        # Cache is already within limits. Normally this should not happen
        # but it could due to multithreading.
        return

    number_to_toss = max_length // divisor + overage

    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        # re->random->warnings->tokenize->string->re
        return

    for doomed_key in random.sample(cache_keys, number_to_toss):
        try:
            del cache_dict[doomed_key]
        except KeyError:
            # Ignore problems if the cache changed from another thread.
            pass

    # Rebuild the arguments and locale-sensitivity dictionaries.
    args_dict.clear()
    sensitivity_dict = {}
    for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
        args_dict[pattern, pattern_type, flags, default_version, locale] = args
        try:
            sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
        except KeyError:
            pass

    locale_sensitive.clear()
    locale_sensitive.update(sensitivity_dict)
353
+
354
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # When no explicit encoding flag was given, fall back to the guessed one.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
361
+
362
def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A cased character has more than one case form.
    all_cases = _regex.get_all_cases(info.flags, char)
    return len(all_cases) > 1
365
+
366
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # A cased character has more than one case form.
    all_cases = _regex.get_all_cases(flags, char)
    return len(all_cases) > 1
369
+
370
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)

    # Compile it unless there's no usable firstset.
    if checked and not isinstance(checked, AnyAll):
        return checked.compile(reverse)

    return []
379
+
380
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    # A missing or partially-unknown firstset is unusable.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        # A negated character can't contribute to a firstset.
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding makes the firstset unreliable, so give up.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
410
+
411
+ def _flatten_code(code):
412
+ "Flattens the code from a list of tuples."
413
+ flat_code = []
414
+ for c in code:
415
+ flat_code.extend(c)
416
+
417
+ return flat_code
418
+
419
def make_case_flags(info):
    "Makes the case flags."
    # FULLCASE is dropped from the mask when ASCII is turned on.
    mask = CASE_FLAGS & ~FULLCASE if info.flags & ASCII else CASE_FLAGS
    return info.flags & mask
428
+
429
def make_character(info, value, in_set=False):
    "Makes a character literal."
    # A character set is built case-sensitively, so no case flags there.
    if in_set:
        return Character(value)

    return Character(value, case_flags=make_case_flags(info))
436
+
437
def make_ref_group(info, name, position):
    "Makes a group reference, applying the current case flags."
    return RefGroup(info, name, position, case_flags=make_case_flags(info))
440
+
441
def make_string_set(info, name):
    "Makes a string set, applying the current case flags."
    return StringSet(info, name, case_flags=make_case_flags(info))
444
+
445
def make_property(info, prop, in_set):
    "Makes a property."
    # Inside a set the property is used as-is; outside it picks up the
    # current case flags.
    return prop if in_set else prop.with_flags(case_flags=make_case_flags(info))
451
+
452
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect the "|"-separated alternatives.
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single alternative needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
461
+
462
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # `None` entries are sentinels marking positions where a quantifier must
    # not appear; apply_quantifier pops them to report "nothing to repeat" /
    # "multiple repeat".
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # No element was produced (eg. an inline flag or a
                    # comment); re-read the case flags, which may have
                    # changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)

                    if constraints:
                        # It _is_ a fuzzy constraint.
                        if is_actually_fuzzy(constraints):
                            apply_constraint(source, info, constraints, case_flags,
                              saved_pos, sequence)
                            sequence.append(None)
                        else:
                            # The element was just a literal.
                            sequence.append(Character(ord(ch),
                              case_flags=case_flags))
                    else:
                        # A literal.
                        sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the sentinels before building the sequence node.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
547
+
548
def is_actually_fuzzy(constraints):
    "Checks whether a fuzzy constraint is actually fuzzy."
    # Zero total errors allowed means it isn't fuzzy at all.
    if constraints.get("e") == (0, 0):
        return False

    # Nor is it fuzzy if substitutions, insertions and deletions are all
    # pinned at zero.
    return not all(constraints.get(kind) == (0, 0) for kind in "sid")
557
+
558
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    # Applies the quantifier in `counts` to the most recent element of
    # `sequence`. A `None` sentinel there means there's nothing quantifiable
    # at this position.
    element = sequence.pop()
    if element is None:
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
589
+
590
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the last element of the sequence."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # If a group is marked as fuzzy then put all of the fuzzy part in the
    # group.
    if isinstance(element, Group):
        element.subpattern = Fuzzy(element.subpattern, constraints)
    else:
        element = Fuzzy(element, constraints)

    sequence.append(element)
603
+
604
# The shorthand quantifiers, mapped to their (min, max) repeat counts
# (None means unbounded).
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
605
+
606
def parse_quantifier(source, info, ch):
    "Parses a quantifier."
    # One of the shorthand quantifiers "?", "*" or "+"?
    counts = _QUANTIFIERS.get(ch)
    if counts:
        return counts

    if ch == "{":
        # Looks like a limited repeated element, eg. 'a{2,3}'; returns None
        # if it turns out not to be one.
        return parse_limited_quantifier(source)

    return None
620
+
621
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unbounded" here, which is never above the limit.
    if count is None:
        return False

    return count >= UNLIMITED
624
+
625
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. the '{2,3}' of 'a{2,3}'."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        # The {m,n} form; either bound may be omitted.
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    elif min_count:
        # The {m} form: exactly m repeats.
        min_count = max_count = int(min_count)
    else:
        # "{" followed by neither digits nor ",": not a quantifier.
        source.pos = saved_pos
        return None

    if not source.match("}"):
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
654
+
655
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; rewind so the caller can treat the
        # "{" as a literal.
        source.pos = saved_pos
        return None

    if source.match(":"):
        # An optional test pattern follows the constraints.
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
678
+
679
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple cost constraint; rewind and try a cost equation.
        source.pos = saved_pos

        parse_cost_equation(source, constraints)
688
+
689
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        # After normalisation the range must be sane and non-negative.
        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
749
+
750
def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)

    # parse_count returns only decimal digits, so int() can fail solely on
    # an empty string (ie. no number present here).
    if not digits:
        raise error("bad fuzzy cost limit", source.string, cost_pos)

    return int(digits)
761
+
762
def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Must be a known constraint letter, and not one already given.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
771
+
772
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # "<=" is inclusive, "<" is exclusive; check "<=" first so the "<"
    # doesn't swallow its prefix.
    for token, inclusive in (("<=", True), ("<", False)):
        if source.match(token):
            return inclusive

    return None
780
+
781
def parse_cost_equation(source, constraints):
    "Parses a cost equation."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    # One or more "+"-separated terms.
    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit ("<") excludes the bound itself.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
807
+
808
def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coeff = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # An omitted coefficient defaults to 1.
    cost[ch] = int(coeff or 1)
819
+
820
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            # Any other special character (or the end of the pattern) is
            # invalid here.
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        raise error("expected character set", source.string, saved_pos)
845
+
846
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the run of decimal digits at the current position, "" if none.
    return source.get_while(DIGITS)
849
+
850
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Restore the flags: inline flags inside a group are scoped
                # to it.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n) or (?-n): a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the flags: inline flags inside a group are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
941
+
942
def parse_extension(source, info):
    "Parses a Python extension."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Restore the flags: inline flags inside a group are scoped to
            # it.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
977
+
978
def parse_comment(source):
    "Parses a comment."
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            break

        # A backslash escapes the next character, even ")".
        if c == "\\":
            c = source.get(True)

    # Rewind so expect(")") consumes the terminator, raising if the comment
    # was unterminated.
    source.pos = saved_pos
    source.expect(")")

    return None
994
+
995
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the flags: inline flags inside the lookaround are scoped
        # to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
1006
+
1007
def parse_conditional(source, info):
    "Parses a conditional subpattern."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # Not a lookaround conditional, so it must test a group by name/number.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            # A missing "no" branch matches the empty string.
            no_branch = Sequence()

        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional is a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
1049
+
1050
def parse_lookaround_conditional(source, info, behind, positive):
    """Parses a conditional subpattern whose test is a lookaround,
    e.g. (?(?=test)yes|no).
    """
    flags_before = info.flags
    try:
        condition = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the scoped flags even if parsing raised.
        info.flags = flags_before
        source.ignore_space = bool(info.flags & VERBOSE)

    branch_yes = parse_sequence(source, info)
    branch_no = parse_sequence(source, info) if source.match("|") else Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, condition, branch_yes,
      branch_no)
+
1070
def parse_atomic(source, info):
    """Parses an atomic (non-backtracking) subpattern (?>...)."""
    flags_before = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the scoped flags even if parsing raised.
        info.flags = flags_before
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)
+
1082
def parse_common(source, info):
    """Parses a common-groups branch (?|...)."""
    # Every alternative may reuse the same capture group numbers, so each
    # branch restarts numbering from the count at entry; the pattern as a
    # whole ends up with the maximum over all branches.
    start_count = info.group_count
    alternatives = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = start_count
        alternatives.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    if len(alternatives) == 1:
        return alternatives[0]
    return Branch(alternatives)
+
1100
def parse_call_group(source, info, ch, pos):
    """Parses a call to a group, e.g. (?1) or (?R)."""
    # (?R) is a recursive call to the whole pattern, i.e. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
+
1111
def parse_rel_call_group(source, info, ch, pos):
    """Parses a relative call to a group, e.g. (?+1) or (?-1)."""
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    # "+n" counts forwards from the current group count; "-n" backwards.
    if ch == "+":
        group = info.group_count + offset
    else:
        group = info.group_count - offset + 1
    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)
+
1126
def parse_call_named_group(source, info, pos):
    """Parses a call to a named group, e.g. (?&name)."""
    group_name = parse_name(source)
    source.expect(")")

    return CallGroup(info, group_name, pos)
+
1133
def parse_flag_set(source):
    """Parses a set of inline flag letters, returning the combined flag bits.

    Consumes letters until one isn't a recognised flag; the loop is exited
    via the KeyError from REGEX_FLAGS (also raised for "" at end of input),
    at which point the position is rewound past the unrecognised character.
    """
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # Version flags are two letters: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        source.pos = saved_pos

    return flags
+
1149
def parse_flags(source, info):
    """Parses inline flags being turned on/off, e.g. (?i) or (?i-s)."""
    on_flags = parse_flag_set(source)
    off_flags = 0
    if source.match("-"):
        off_flags = parse_flag_set(source)
        if not off_flags:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if on_flags & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return on_flags, off_flags
+
1166
def parse_subpattern(source, info, flags_on, flags_off):
    """Parses a subpattern with scoped flags, e.g. (?i:...).

    The flags apply only within the subpattern; the enclosing flags are
    restored afterwards.
    """
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this clears all encoding bits and then re-ORs the whole
    # of flags_on, so an encoding flag inherited from the enclosing scope is
    # dropped unless flags_on re-asserts one — confirm this scoping is
    # intended.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the enclosing flags even if parsing raised.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
+
1185
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's inline flags, then None is returned.

    Raises _UnscopedFlagSet when a new global flag is seen, which forces the
    caller to reparse the whole pattern with that flag in effect.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # (?flags:...): a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # (?flags): positional inline flags.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
+
1220
def parse_positional_flags(source, info, flags_on, flags_off):
    """Applies positional inline flags, e.g. (?i), from this point onwards."""
    info.flags = (info.flags | flags_on) & ~flags_off
    # Verbose mode may have just been switched on or off.
    source.ignore_space = bool(info.flags & VERBOSE)
+
1225
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parses a group name.

    A name is either an identifier or, when allow_numeric is true, a group
    number (0 being permitted only when allow_group_0 is also true).
    """
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        lowest = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < lowest:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string, source.pos)

    return name
+
1244
def is_octal(string):
    """Checks whether a string consists entirely of octal digits."""
    for ch in string:
        if ch not in OCT_DIGITS:
            return False
    return True
+
1248
def is_decimal(string):
    """Checks whether a string consists entirely of decimal digits."""
    for ch in string:
        if ch not in DIGITS:
            return False
    return True
+
1252
def is_hexadecimal(string):
    """Checks whether a string consists entirely of hexadecimal digits."""
    for ch in string:
        if ch not in HEX_DIGITS:
            return False
    return True
+
1256
def parse_escape(source, info, in_set):
    """Parses an escape sequence, returning the node it denotes.

    `in_set` is True when parsing inside a character set, where several
    escapes (\\g, \\G, \\L, \\R, \\X, positional escapes) are not special.
    """
    # The escaped character itself is never skipped as whitespace, even in
    # verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        # CRLF is matched atomically as a pair, otherwise a single character.
        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes, e.g. \\d, \\s, \\w, chosen per encoding flag.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes, e.g. \\n, \\t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
+
1339
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence: either an octal escape or a group
    reference (the latter only outside a character set).
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Byte-oriented encodings limit the value to one byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
+
1372
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of up to 3 digits.

    `digits` holds the digit(s) already consumed; further octal digits are
    read from the source.
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Rewind past the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() fails when the initial digit was "8" or "9".
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
+
1393
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parses a hex escape sequence, e.g. \\xNN, \\uNNNN or \\UNNNNNNNN."""
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    value = int("".join(digits), 16)
    if value < 0x110000:
        return make_character(info, value, in_set)

    # The codepoint is outside the Unicode range.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, start_pos)
+
1416
def parse_group_ref(source, info):
    """Parses a group reference, e.g. \\g<name>."""
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
+
1427
def parse_string_set(source, info):
    """Parses a named-list (string set) reference, e.g. \\L<name>."""
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # The list must have been supplied as a keyword argument at compile time.
    if name is None or name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
+
1437
def parse_named_char(source, info, in_set):
    """Parses a named character, e.g. \\N{LATIN SMALL LETTER A}."""
    start_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                return make_character(info, ord(unicodedata.lookup(name)),
                  in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat "\N" as a literal "N".
    source.pos = start_pos
    return make_character(info, ord("N"), in_set)
+
1453
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, e.g. \\p{Lu}, \\p{Script=Latin} or \\pL.

    `positive` is False for \\P. Falls back to a literal "p"/"P" when no
    property syntax follows.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # "^" inside the braces also negates, e.g. \p{^Lu}.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
+
1489
def parse_property_name(source):
    """Parses a property name, which may be qualified, e.g. "Lu" or
    "Script=Latin".

    Returns (prop_name, name); prop_name is None for an unqualified name.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Rewind past anything not consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
+
1511
def parse_set(source, info):
    """Parses a character set [...] after the opening "[".

    VERSION0 supports only implicit union; VERSION1 adds the set operators
    (||, ~~, &&, --).
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
+
1537
def parse_set_union(source, info):
    """Parses a set union ([x||y])."""
    members = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        members.append(parse_set_symm_diff(source, info))

    return members[0] if len(members) == 1 else SetUnion(info, members)
+
1547
def parse_set_symm_diff(source, info):
    """Parses a set symmetric difference ([x~~y])."""
    members = [parse_set_inter(source, info)]
    while source.match("~~"):
        members.append(parse_set_inter(source, info))

    return members[0] if len(members) == 1 else SetSymDiff(info, members)
+
1557
def parse_set_inter(source, info):
    """Parses a set intersection ([x&&y])."""
    members = [parse_set_diff(source, info)]
    while source.match("&&"):
        members.append(parse_set_diff(source, info))

    return members[0] if len(members) == 1 else SetInter(info, members)
+
1567
def parse_set_diff(source, info):
    """Parses a set difference ([x--y])."""
    members = [parse_set_imp_union(source, info)]
    while source.match("--"):
        members.append(parse_set_imp_union(source, info))

    return members[0] if len(members) == 1 else SetDiff(info, members)
+
1577
def parse_set_imp_union(source, info):
    """Parses a set implicit union ([xy]): members juxtaposed with no
    operator between them.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; stop so the caller can
            # handle the operator.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
+
1600
def parse_set_member(source, info):
    """Parses a member in a character set: a single item or a range a-b."""
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range such as a-a is just the character.
        return start

    return Range(start.value, end.value)
+
1641
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a nested
    set (VERSION1 only), or a literal character.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
+
1679
def parse_posix_class(source, info):
    """Parses a POSIX character class, e.g. [:alpha:]."""
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        # Not terminated by ":]", so it isn't a POSIX class after all.
        raise ParseError()

    return lookup_property(prop_name, name, not negate, source, posix=True)
+
1688
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Renamed from "error": the old local shadowed the module-level `error`
    # exception class, which would silently break any future `raise error(...)`
    # added inside this function.
    remainder = flt - int_part
    if abs(remainder) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / remainder)

    return int_part * den + num, den

def numeric_to_rational(numeric):
    """Converts a numeric string to a rational string, if possible.

    Accepts plain numbers ("3", "-0.5") and fractions ("2/4").
    Raises ValueError if the string isn't numeric.
    """
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    if result.endswith("/1"):
        # Omit a denominator of 1, e.g. "3/1" -> "3".
        return result[ : -2]

    return result

def standardise_name(name):
    """Standardises a property or value name.

    Numeric names become canonical rationals; other names are uppercased
    with "_", "-" and spaces removed.
    """
    try:
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        return "".join(ch for ch in name if ch not in "_- ").upper()
+
1727
# Class names that get a "POSIX" prefix when looked up in POSIX mode (see
# lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete set of value names exposed by a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
+
1731
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property, returning a Property node.

    `property` may be None, in which case `value` alone is resolved against
    general categories, scripts, blocks and binary properties (optionally
    with an "IS"/"IN" prefix). Raises `error` if nothing matches.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is handled as the complement of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Property and value ids are packed into a single integer.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        # Not binary: match value 0 of the property with inverted polarity.
        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            # Both prefixes are 2 characters long, hence value[2 : ].
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
+
1801
def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): when `is_group` is True, `codes` holds a group
    index; otherwise it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    if isinstance(source.sep, bytes):
        # Octal values are masked to a byte for bytes patterns.
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence, max 3 digits.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
+
1873
def parse_repl_hex_escape(source, expected_len, type):
    """Parses a hex escape sequence in a replacement string."""
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
+
1885
def parse_repl_named_char(source):
    """Parses a named character in a replacement string.

    Returns its codepoint, or None if no named character is present.
    """
    start_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a named character; rewind and let the caller decide.
    source.pos = start_pos
    return None
+
1902
def compile_repl_group(source, pattern):
    """Compiles a replacement template group reference, e.g. \\g<name>."""
    source.expect("<")
    name = parse_name(source, True, True)
    source.expect(">")

    if not name.isdigit():
        # A named group.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    # A numeric reference; group 0 is the whole match.
    index = int(name)
    if not 0 <= index <= pattern.groups:
        raise error("invalid group reference", source.string, source.pos)

    return index
+
1920
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the nodes' dump() debugging methods.
INDENT = "  "
# Bit-flags embedded in compiled opcode tuples.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding is stored in the bits above the flag bits.
ENCODING_OP_SHIFT = 5

# Text fragments used by dump() for match polarity and case flags.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
+
1935
def make_sequence(items):
    """Wraps the items in a Sequence unless there is exactly one of them."""
    return items[0] if len(items) == 1 else Sequence(items)
+
1940
+ # Common base class for all nodes.
1941
class RegexBase:
    """Common base class for all syntax-tree nodes.

    Subclasses override the tree-transformation hooks (optimise,
    pack_characters, remove_captures, ...) and the compilation entry point
    _compile as needed; the defaults here are no-ops suitable for leaf nodes.
    """

    def __init__(self):
        # _key is used for hashing/equality; subclasses extend it with their
        # distinguishing attributes.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns a copy with the given flags changed (or self if none
        actually change). None means "keep the current value".
        """
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise to a canonical combination of case flags.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Leaf nodes have no group references to fix.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default a node cannot report a first-character set.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # (offset, required-string); None means no required string.
        return self.max_width(), None
+
2010
+ # Base class for zero-width nodes.
2011
class ZeroWidthBase(RegexBase):
    """Base class for zero-width (assertion) nodes.

    Subclasses supply _opcode and _op_name.
    """

    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node consumes no characters.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        # The encoding goes in the bits above the flag bits.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0
+
2040
class Any(RegexBase):
    """Matches any character except a newline (the `.` metacharacter)."""

    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = FUZZY_OP if fuzzy else 0
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always exactly one character.
        return 1
2058
+
2059
class AnyAll(Any):
    """Matches any character including newlines (`.` under DOTALL)."""

    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Fixed state: always positive, never zero-width, no case flags.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
2069
+
2070
class AnyU(Any):
    """Like Any, but treats the Unicode line separators as newlines."""

    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
2073
+
2074
class Atomic(RegexBase):
    """An atomic (non-backtracking) group: `(?>...)`."""

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic wrapper around nothing is just nothing.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Bracket the subpattern's code with ATOMIC ... END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
2129
+
2130
class Boundary(ZeroWidthBase):
    """The word-boundary assertion `\\b`."""

    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2133
+
2134
class Branch(RegexBase):
    """An alternation node: `a|b|c`.

    Optimisation flattens nested branches, hoists common prefixes and
    suffixes, and collapses single-character alternatives into sets.
    """

    def __init__(self, branches):
        RegexBase.__init__(self)
        self.branches = branches

    def fix_groups(self, pattern, reverse, fuzzy):
        for b in self.branches:
            b.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the
                # branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)

    def _add_precheck(self, info, reverse, branches):
        # Build a zero-width first-character check, but only when every
        # branch is a case-sensitive literal.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in
          charset])

    def pack_characters(self, info):
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        return any(b.contains_group() for b in self.branches)

    def get_firstset(self, reverse):
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])

    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The final NEXT is replaced by END.
        code[-1] = (OP.END, )

        return code

    def dump(self, indent, reverse):
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)

    @staticmethod
    def _flatten_branches(info, reverse, branches):
        # Flatten the branches so that there aren't branches of branches.
        new_branches = []
        for b in branches:
            b = b.optimise(info, reverse)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches

    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches

    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches

    @staticmethod
    def _can_split(items, count):
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True

    @staticmethod
    def _can_split_rev(items, count):
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True

    @staticmethod
    def _merge_common_prefixes(info, reverse, branches):
        # Branches with the same case-sensitive character prefix can be
        # grouped together if they are separated only by other branches with
        # a character prefix.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                Branch._flush_char_prefix(info, reverse, prefixed, order,
                  new_branches)

                new_branches.append(b)

        # BUGFIX: the final flush previously omitted the `reverse` argument,
        # which would raise TypeError (signature takes 5 parameters).
        Branch._flush_char_prefix(info, reverse, prefixed, order,
          new_branches)

        return new_branches

    @staticmethod
    def _is_simple_character(c):
        return isinstance(c, Character) and c.positive and not c.case_flags

    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items,
                      case_flags, new_branches)

                    case_flags = b.case_flags

                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches

    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()

    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()

    @staticmethod
    def _is_full_case(items, i):
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

    @staticmethod
    def _is_folded(items):
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False

    def is_empty(self):
        return all(b.is_empty() for b in self.branches)

    def __eq__(self, other):
        return type(self) is type(other) and self.branches == other.branches

    def max_width(self):
        return max(b.max_width() for b in self.branches)
2525
+
2526
class CallGroup(RegexBase):
    """A subpattern call `(?&name)` / `(?n)` to a numbered or named group."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve the group reference to a group number.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): `self.pattern` is never assigned in this class, so
        # this would raise AttributeError before the intended error — confirm
        # against callers before relying on the error message.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2571
+
2572
class CallRef(RegexBase):
    """Wraps a parsed subpattern so it can be the target of a group call."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        return ([(OP.CALL_REF, self.ref)] + self.parsed._compile(reverse,
          fuzzy) + [(OP.END, )])
2580
+
2581
class Character(RegexBase):
    """A single (possibly negated, possibly case-insensitive) codepoint."""

    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV,
      (IGNORECASE, True): OP.CHARACTER_IGN_REV, (FULLCASE, True):
      OP.CHARACTER_REV, (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Under full case-folding a single character may fold to several.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse],
          flags, self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try the
            # folded string as an alternative.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (ch == self.value) == self.positive

    def max_width(self):
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
2654
+
2655
class Conditional(RegexBase):
    """A conditional subpattern `(?(group)yes|no)`."""

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUGFIX: return the node, like every other remove_captures
        # implementation; callers chain `item = item.remove_captures()`.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2743
+
2744
# Trivial zero-width anchor nodes; each pairs an opcode with its dump name.
class DefaultBoundary(ZeroWidthBase):
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"

class DefaultEndOfWord(ZeroWidthBase):
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"

class DefaultStartOfWord(ZeroWidthBase):
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"

class EndOfLine(ZeroWidthBase):
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"

class EndOfLineU(EndOfLine):
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"

class EndOfString(ZeroWidthBase):
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"

class EndOfStringLine(ZeroWidthBase):
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"

class EndOfStringLineU(EndOfStringLine):
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"

class EndOfWord(ZeroWidthBase):
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
2779
+
2780
class Failure(ZeroWidthBase):
    """A node that always fails to match: `(*FAIL)`."""

    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
2785
+
2786
class Fuzzy(RegexBase):
    """A fuzzy-matching region `(?:...){constraints}`.

    `constraints` maps each error type ("d"eletion, "i"nsertion,
    "s"ubstitution, "e" for any error) to a (min, max) pair, plus a "cost"
    equation; missing entries are filled with defaults in __init__.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its
        # maximum defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default
        # to 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in the pattern's textual syntax."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
2918
+
2919
class Grapheme(RegexBase):
    """Matches one extended grapheme cluster (`\\X`)."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached.
        # Note that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
2933
+
2934
class GraphemeBoundary:
    """Zero-width check for a grapheme-cluster boundary."""

    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
2937
+
2938
class GreedyRepeat(RegexBase):
    """A greedy repeat `x{min,max}`; max_count of None means unbounded."""

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) keeps this correct for LazyRepeat/PossessiveRepeat.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern,
          self.min_count, self.max_count) == (other.subpattern,
          other.min_count, other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The repeat is optional, so nothing is required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
3029
+
3030
class PossessiveRepeat(GreedyRepeat):
    """A possessive repeat `x{min,max}+`: greedy with no backtracking."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # Wrap the repeat in ATOMIC ... END to forbid backtracking.
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
3059
+
3060
class Group(RegexBase):
    """A capture group wrapping a subpattern.

    Negative group numbers denote private (internal) groups; they are
    mapped to their public numbers via info.private_groups.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        # Filled in externally when this group is the target of a group call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        # Record where (and with which direction/fuzziness) this group is
        # defined so group references/calls can locate it.
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Map a private (negative) group number to its public number, and
        # to its storage slot after the public groups.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        # If this group is called elsewhere, bracket it with a CALL_REF so
        # the call can transfer control here.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
            (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info object.
        self.info = None
3141
+
3142
class Keep(ZeroWidthBase):
    # Zero-width \K: discards the part of the match before this point.
    _opcode = OP.KEEP
    _op_name = "KEEP"
3145
+
3146
class LazyRepeat(GreedyRepeat):
    # Same structure as GreedyRepeat; only the opcode differs, making the
    # engine prefer the minimum number of repetitions.
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
3149
+
3150
class LookAround(RegexBase):
    """A zero-width lookaround assertion (lookahead or lookbehind,
    positive or negative) around a subpattern."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern is matched in the lookaround's own direction
        # (backwards for a lookbehind), not the enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround on an empty subpattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion matching in the same direction as the
        # search constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without the fuzzy
        # argument (compile's default applies) — confirm this is intended.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
            self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
            self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
            self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Zero-width: consumes no characters.
        return 0
3217
+
3218
class LookAroundConditional(RegexBase):
    """A conditional (?(?=...)yes|no) whose condition is a lookaround:
    matches yes_item if the assertion holds, else no_item."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
            yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
            self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
            and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
            self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # Layout: CONDITIONAL <test> NEXT <yes> [NEXT <no>] END.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
            self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): parses as (subpattern and yes) or no due to
        # operator precedence — confirm this grouping is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
            self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
            self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
3302
+
3303
class PrecompiledCode(RegexBase):
    """Already-compiled code injected verbatim into the pattern."""

    def __init__(self, code):
        # code: a sequence of opcode values emitted as a single tuple.
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
3309
+
3310
class Property(RegexBase):
    """A Unicode property test, e.g. \p{Lu}.

    `value` packs the property id in the high 16 bits and the property
    value in the low 16 bits (see dump's >> 16 / & 0xFFFF).
    """

    # Opcode keyed by (case_flags, reverse); simple and full case-folding
    # share the same property opcodes.
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
        OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
        OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
        OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
        True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
        zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        # encoding selects e.g. ASCII semantics; shifted into the opcode
        # flags in _compile.
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
            self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Property(self.value, positive, case_flags, zerowidth,
            self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Unpack the property id/value for display.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
            POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
            ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
3365
+
3366
class Prune(ZeroWidthBase):
    """Zero-width (*PRUNE) backtracking-control verb."""

    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # Unlike other zero-width nodes, PRUNE takes no flags.
        return [(OP.PRUNE, )]
3371
+
3372
class Range(RegexBase):
    """An inclusive codepoint range [lower..upper]."""

    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
        (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
        (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
        (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
        zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
            self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
            FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range: each becomes a String
        # alternative, since a single-codepoint range test can't match a
        # multi-codepoint folding.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                    case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
            self.upper)]

    def dump(self, indent, reverse):
        # ascii() yields a quoted repr; lstrip removes a b/u prefix.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
            POS_TEXT[self.positive], display_lower, display_upper,
            CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1
3448
+
3449
class RefGroup(RegexBase):
    """A backreference to a capture group, by number or by name."""

    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
        OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
        False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
        True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
        (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        # group may still be a name at this point; resolved in fix_groups.
        self.group = group
        # position: index into the pattern, used for error reporting.
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a name to its group number (EAFP: try as an int first).
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group number is final.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # NOTE(review): relies on self.pattern, which is not set in this
        # class's __init__ — confirm RegexBase provides it.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
            CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info object.
        self.info = None
3497
+
3498
class SearchAnchor(ZeroWidthBase):
    # Zero-width \G: matches at the point where the search started.
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
3501
+
3502
class Sequence(RegexBase):
    """A concatenation of pattern items, matched in order."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                            case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless the neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                        characters):
                        Sequence._flush_characters(info, characters,
                            case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Non-character item: flush what we have, then keep it as-is.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Accumulate first-sets until an item that can't match empty
        # (no None in its first-set) is reached.
        # NOTE(review): `items.reverse()` mutates self.items in place via
        # the alias — confirm this is intended.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # When matching backwards the items are emitted in reverse order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        # Turn the pending run of characters into Character/String items,
        # then clear it (in place, since the caller holds the list).
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
            _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
            for c in characters)).lower()
        chunks = []

        # Record every span of the folded string that matches an expanding
        # folding.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Emit FULLIGNORECASE literals for the merged spans and plain
        # IGNORECASE literals for the gaps between them.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                    case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
                case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping/adjacent (start, end) spans after sorting.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return the first required string found, with the cumulative
        # offset of the items before it.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
3714
+
3715
class SetBase(RegexBase):
    """Base class for character-set nodes (union, intersection, etc.)."""

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
        zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
            self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return type(self)(self.info, self.items, positive, case_flags,
            zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # Set opcode, then each member's code, terminated by END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
            POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
            FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set: each distinct folding
        # becomes a String alternative alongside the set itself.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                        case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
            FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        # Widest possible match is the longest expansion.
        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the pattern info object.
        self.info = None
3819
+
3820
class SetDiff(SetBase):
    """Set difference [first--rest]: matches the first item but not the
    union of the remaining items."""

    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
        OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
        OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
        OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
        True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        # Normalise to exactly two operands: first minus the union of the
        # rest.
        items = self.items
        if len(items) > 2:
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
                zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
            items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive
3845
+
3846
class SetInter(SetBase):
    """Set intersection [a&&b]: matches only characters in every item."""

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
        OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
        False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
        True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
        (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection: flatten.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A one-member intersection is just that member.
            return items[0].with_flags(case_flags=self.case_flags,
                zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive
3875
+
3876
class SetSymDiff(SetBase):
    """Set symmetric difference [a~~b]: matches characters in an odd
    number of the items."""

    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
        OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
        False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
        (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
        OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference: flatten.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A one-member symmetric difference is just that member.
            return items[0].with_flags(case_flags=self.case_flags,
                zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the membership tests: true for an odd number of hits.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
3908
+
3909
class SetUnion(SetBase):
    """Set union [ab...]: matches characters in any of the items."""

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
        OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
        False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
        True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
        (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union: flatten.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # Union with "anything" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?  A property and its negation
        # together also match anything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
                case_flags=self.case_flags,
                zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Group plain characters by polarity so they can be emitted as a
        # single CHARACTER/STRING member instead of many.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
3986
+
3987
class Skip(ZeroWidthBase):
    # Zero-width (*SKIP) backtracking-control verb.
    _op_name = "SKIP"
    _opcode = OP.SKIP
3990
+
3991
class StartOfLine(ZeroWidthBase):
    # Zero-width ^ (MULTILINE): matches at the start of a line.
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
3994
+
3995
class StartOfLineU(StartOfLine):
    # Variant of StartOfLine using the Unicode line-break opcode.
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
3998
+
3999
class StartOfString(ZeroWidthBase):
    # Zero-width \A: matches only at the start of the string.
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
4002
+
4003
class StartOfWord(ZeroWidthBase):
    # Zero-width \m: matches at the start of a word.
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
4006
+
4007
class String(RegexBase):
    """A literal string of codepoints."""

    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
        (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
        (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
        (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
        OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        # Under full case-folding, pre-fold the characters once here; the
        # folded form is what gets compiled and measured.
        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set externally when this is the pattern's required string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # First character when matching forwards, last when backwards.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
            case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
            len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        # ascii() yields a quoted repr; lstrip removes a b/u prefix.
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
            CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self
4061
+
4062
class Literal(String):
    # A String whose dump output is labelled "LITERAL MATCH" instead of
    # "STRING"; the matching behaviour is inherited unchanged.
    def dump(self, indent, reverse):
        """Print a representation of this node for debugging."""
        literal = ''.join(chr(c) for c in self.characters)
        display = ascii(literal).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))
4068
+
4069
class StringSet(Branch):
    """A named list reference, expanded into a branch of alternatives.

    The strings come from ``info.kwargs[name]`` (the named lists supplied
    to the compile call) and are turned into per-character sequences.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        """Build the branch for the named list `name`.

        Also registers (name, case_flags) in ``info.named_lists_used`` so
        the pattern records which named lists it depends on.
        """
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # NOTE: the original code also computed `index`, `encoding` and
        # `fold_flags` here, but never used them; they have been removed.
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest so that longer alternatives are
        # tried first.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        """Print a representation of this node for debugging."""
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
4109
+
4110
class Source:
    """Scanner for the regular expression source string.

    Bytes patterns are decoded as Latin-1 (a 1-to-1 byte/character
    mapping); `char_type` converts an ordinal back to the pattern's own
    type. When `ignore_space` is set (VERBOSE mode), the reading methods
    transparently skip whitespace and `#`-comments.
    """
    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty string/bytes of the pattern's original type.
        self.sep = string[ : 0]

    def peek(self, override_ignore=False):
        """Return the next character without consuming it.

        Returns an empty string at the end of the source.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        """Consume and return the next character.

        Returns an empty string at the end of the source.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Consume and return up to `count` characters."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Consume and return characters while (ch in test_set) == include."""
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Consume characters while (ch in test_set) == include, discarding
        them.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """If the source continues with `substring`, consume it and return
        True; otherwise leave the position unchanged and return False.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        """Consume `substring` or raise a parse error."""
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Return True when nothing but ignorable whitespace/comments (in
        VERBOSE mode) remains.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
4355
+
4356
class Info:
    """Info about the regular expression.

    Collects the flags, capture-group bookkeeping and group calls gathered
    while a pattern is parsed.
    """

    def __init__(self, flags=0, char_type=None, kwargs=None):
        """Initialise the parse state.

        Args:
            flags: The regex flags; defaults for the selected version are
                merged in.
            char_type: Callable converting an ordinal to a character of the
                pattern's type (str or bytes).
            kwargs: Named lists supplied to the compile call, keyed by name.
                (The original code used a mutable default ``{}``; a ``None``
                sentinel is used instead to avoid the shared-default
                pitfall. Behaviour is unchanged — the dict was never
                mutated.)
        """
        # Merge in the default flags for the regex version in use.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = {} if kwargs is None else kwargs

        self.group_count = 0
        self.group_index = {}
        self.group_name = {}
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Allocate (or look up) the number of a group being opened.

        Returns a negative private alias when the group is already open (a
        nested named group); the alias maps to the real group number via
        ``private_groups`` until a proper number can be assigned.
        """
        group = self.group_index.get(name)
        if group is None:
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        """Mark the most recently opened group as closed."""
        self.open_groups.pop()

    def is_open_group(self, name):
        """Return whether the group called `name` (a name or digit string)
        is currently open.
        """
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
4420
+
4421
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Results are stored on `info`: `call_refs` maps each distinct
    (group, reverse, fuzzy) call to a reference number, and
    `additional_groups` lists copies that must be made because the called
    group lacks the requested features.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
4459
+
4460
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns a (offset, characters, case flags) triple; the offset is -1
    when it is unbounded, and the triple is (0, (), 0) when the pattern
    has no required string.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))

    if not required:
        # No required string.
        return 0, (), 0

    # Mark the node so compilation emits it with the REQUIRED_OP flag.
    required.required = True

    # An unbounded offset is reported as -1.
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
4480
+
4481
class Scanner:
    """A lexical scanner: compiles a lexicon of (phrase, action) pairs into
    one compound pattern, one capture group per phrase.
    """
    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern. Each phrase gets
        # its own group so m.lastindex identifies the matching phrase.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scan `string`, returning (list of action results, remainder).

        Scanning stops at the first position where no phrase matches, or
        where a zero-width match would make no progress.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match; stop to avoid looping forever.
                break
            # m.lastindex identifies which phrase matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
4576
+
4577
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict. For each property id, keep the
# longest known name, and likewise the longest name for each of its values.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)

# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Encoding selectors passed to lookup_property / the position nodes.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2

# Predefined character set escape sequences.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# Variants of the charset escapes restricted to the ASCII encoding.
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
})
# Variants of the charset escapes restricted to the Unicode encoding.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
})

# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}
# Positional escapes restricted to the ASCII encoding.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES.update({
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
})
# Positional escapes restricted to the Unicode encoding.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES.update({
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
})

# Positional escape sequences when WORD flag set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})

# Regex control verbs.
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}