regex 2025.11.3__cp313-cp313-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of regex might be problematic; see the registry's advisory page for more details.

regex/_regex_core.py ADDED
@@ -0,0 +1,4663 @@
1
+ #
2
+ # Secret Labs' Regular Expression Engine core module
3
+ #
4
+ # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5
+ #
6
+ # This version of the SRE library can be redistributed under CNRI's
7
+ # Python 1.6 license. For any other use, please contact Secret Labs
8
+ # AB (info@pythonware.com).
9
+ #
10
+ # Portions of this engine have been developed in cooperation with
11
+ # CNRI. Hewlett-Packard provided funding for 1.6 integration and
12
+ # other compatibility work.
13
+ #
14
+ # 2010-01-16 mrab Python front-end re-written and extended
15
+
16
+ import enum
17
+ import string
18
+ import unicodedata
19
+ from collections import defaultdict
20
+
21
+ from regex import _regex
22
+
23
# The public API re-exported by the top-level `regex` module: the flag
# constants (both short and long names), the exception type, the Scanner
# helper and the RegexFlag enum.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
28
+
29
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # A bytes pattern needs a bytes newline for count/rfind below.
        newline = b'\n' if not isinstance(pattern, str) else '\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos

        have_location = pattern is not None and pos is not None
        if have_location:
            # Derive the line and column from the newlines before `pos`.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Only mention line/column for multi-line patterns.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)
58
+
59
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    """Internal: raised when a global flag appears inline under V0 semantics,
    so the caller can restart parsing with the flag applied globally."""
    pass
63
+
64
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    """Internal: signals a failed speculative parse; the caller backtracks
    and tries an alternative interpretation."""
    pass
67
+
68
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    """Internal: raised when a usable firstset cannot be built for the
    pattern; the firstset optimisation is then skipped."""
    pass
71
+
72
# Flags.
class RegexFlag(enum.IntFlag):
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    # Attempt to improve the fit after finding the first fuzzy match.
    E = ENHANCEMATCH = 0x8000
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A named (canonical) member prints directly.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        remaining = self._value_
        inverted = remaining < 0
        if inverted:
            remaining = ~remaining

        # Decompose the combined value into its named members.
        parts = []
        for member in self.__class__:
            if remaining & member._value_:
                remaining &= ~member._value_
                parts.append('regex.%s' % member._name_)

        # Any residue has no name; show it as hex.
        if remaining:
            parts.append(hex(remaining))

        res = '|'.join(parts)

        if inverted:
            res = '~(%s)' % res if len(parts) > 1 else '~%s' % res

        return res

    # Make str() show the same form as repr().
    __str__ = object.__str__
123
+
124
# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# Single-letter aliases of the flags above.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

# The behaviour used when neither V0 nor V1 is given.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that end-of-pattern is treated as special too.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letter.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own has no effect (it only modifies IGNORECASE).
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
210
+
211
# The names of the opcodes. Each name's position in this list is its numeric
# opcode value (see the OP namespace built below).
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
295
+
296
# Define the opcodes in a namespace.
class Namespace:
    """An empty class used purely as an attribute container (see OP below)."""
    pass
299
+
300
# OP.<NAME> is the numeric opcode of <NAME>, taken from its position in
# OPCODES.
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
303
+
304
+ def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
305
+ """Make room in the given cache.
306
+
307
+ Args:
308
+ cache_dict: The cache dictionary to modify.
309
+ args_dict: The dictionary of named list args used by patterns.
310
+ max_length: Maximum # of entries in cache_dict before it is shrunk.
311
+ divisor: Cache will shrink to max_length - 1/divisor*max_length items.
312
+ """
313
+ # Toss out a fraction of the entries at random to make room for new ones.
314
+ # A random algorithm was chosen as opposed to simply cache_dict.popitem()
315
+ # as popitem could penalize the same regular expression repeatedly based
316
+ # on its internal hash value. Being random should spread the cache miss
317
+ # love around.
318
+ cache_keys = tuple(cache_dict.keys())
319
+ overage = len(cache_keys) - max_length
320
+ if overage < 0:
321
+ # Cache is already within limits. Normally this should not happen
322
+ # but it could due to multithreading.
323
+ return
324
+
325
+ number_to_toss = max_length // divisor + overage
326
+
327
+ # The import is done here to avoid a circular dependency.
328
+ import random
329
+ if not hasattr(random, 'sample'):
330
+ # Do nothing while resolving the circular dependency:
331
+ # re->random->warnings->tokenize->string->re
332
+ return
333
+
334
+ for doomed_key in random.sample(cache_keys, number_to_toss):
335
+ try:
336
+ del cache_dict[doomed_key]
337
+ except KeyError:
338
+ # Ignore problems if the cache changed from another thread.
339
+ pass
340
+
341
+ # Rebuild the arguments and locale-sensitivity dictionaries.
342
+ args_dict.clear()
343
+ sensitivity_dict = {}
344
+ for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
345
+ args_dict[pattern, pattern_type, flags, default_version, locale] = args
346
+ try:
347
+ sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
348
+ except KeyError:
349
+ pass
350
+
351
+ locale_sensitive.clear()
352
+ locale_sensitive.update(sensitivity_dict)
353
+
354
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # If no encoding flag was given, fall back to the guessed encoding.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding
    return _regex.fold_case(flags, string)
361
+
362
def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A character is cased if it has more than one case variant.
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1
365
+
366
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # Same as is_cased_i, but takes the flags directly.
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1
369
+
370
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An empty or match-anything firstset is useless for optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []

    # Compile the firstset.
    return checked.compile(reverse)
379
+
380
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    if not fs or None in fs:
        return None

    # Collect the members with their case flags stripped off. If we ignore
    # the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for member in fs:
        if isinstance(member, Character) and not member.positive:
            return None

        case_flags |= member.case_flags
        members.add(member.with_flags(case_flags=NOCASE))

    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    firstset = SetUnion(info, list(members), case_flags=case_flags &
      ~FULLCASE, zerowidth=True)
    return firstset.optimise(info, reverse, in_set=True)
410
+
411
+ def _flatten_code(code):
412
+ "Flattens the code from a list of tuples."
413
+ flat_code = []
414
+ for c in code:
415
+ flat_code.extend(c)
416
+
417
+ return flat_code
418
+
419
def make_case_flags(info):
    "Makes the case flags."
    flags = info.flags & CASE_FLAGS
    # Turn off FULLCASE if ASCII is turned on.
    return flags & ~FULLCASE if info.flags & ASCII else flags
428
+
429
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if in_set:
        # Characters inside a set are built case-sensitively.
        return Character(value)
    return Character(value, case_flags=make_case_flags(info))
436
+
437
def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
440
+
441
def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
444
+
445
def make_property(info, prop, in_set):
    "Makes a property."
    # Inside a set the property keeps its own case flags.
    if in_set:
        return prop
    return prop.with_flags(case_flags=make_case_flags(info))
451
+
452
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect one sequence per "|"-separated alternative.
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single alternative doesn't need a Branch node.
    return branches[0] if len(branches) == 1 else Branch(branches)
461
+
462
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None entries act as separators: apply_quantifier/apply_constraint pop
    # the last element and use a None there to detect "multiple repeat".
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag; the case flags may have changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None separators before building the sequence node.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
545
+
546
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier to the preceding element of the sequence."
    target = sequence.pop()
    if target is None:
        # The previous entry was a separator: either there was a quantifier
        # before this one, or there's nothing to quantify at all.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(target, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    suffix_pos = source.pos
    suffix = source.get()
    if suffix == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeat_class = LazyRepeat
    elif suffix == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeat_class = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat; put the character back.
        source.pos = suffix_pos
        repeat_class = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number
    # of repeats is fixed at 1.
    if not target.is_empty() and (min_count != 1 or max_count != 1):
        target = repeat_class(target, min_count, max_count)

    sequence.append(target)
577
+
578
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the preceding element of the sequence."
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(target, Group):
        # If a group is marked as fuzzy then put all of the fuzzy part in
        # the group.
        target.subpattern = Fuzzy(target.subpattern, constraints)
        sequence.append(target)
    else:
        sequence.append(Fuzzy(target, constraints))
591
+
592
# (min, max) repeat counts for the single-character quantifiers.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}

def parse_quantifier(source, info, ch):
    "Parses a quantifier."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of the single-character quantifiers.
        return counts

    if ch == "{":
        # Looks like a limited repeated element, eg. 'a{2,3}'; returns None
        # if it turns out not to be one.
        return parse_limited_quantifier(source)

    return None
608
+
609
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unbounded", which the engine encodes specially.
    if count is None:
        return False
    return count >= UNLIMITED
612
+
613
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. '{2,3}', returning None if absent."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        # '{n}' with no comma must have a count, or it isn't a quantifier.
        if not min_count:
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    # Without the closing brace it's just a literal '{...'.
    if not source.match("}"):
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
642
+
643
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    start_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; backtrack.
        source.pos = start_pos
        return None

    # An optional test pattern follows ":".
    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
666
+
667
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    item_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple constraint; backtrack and try a cost equation.
        source.pos = item_pos

        parse_cost_equation(source, constraints)
676
+
677
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        # Neither form matches; the caller will try a cost equation.
        raise ParseError()
737
+
738
def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)

    # parse_count yields only decimal digits, so int() can fail only when
    # the string is empty.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, cost_pos)
749
+
750
def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Only d(eletion), e(rror), i(nsertion) and s(ubstitution) are valid,
    # and each may appear at most once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
759
+
760
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # "<=" must be tried before "<".
    if source.match("<="):
        # Inclusive limit.
        return True
    if source.match("<"):
        # Exclusive limit.
        return False
    # No comparator present.
    return None
768
+
769
def parse_cost_equation(source, constraints):
    "Parses a cost equation."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    equation = {}

    # One or more "+"-separated cost terms.
    parse_cost_term(source, equation)
    while source.match("+"):
        parse_cost_term(source, equation)

    inclusive = parse_fuzzy_compare(source)
    if inclusive is None:
        raise ParseError()

    limit = int(parse_count(source))

    # An exclusive "<" limit lowers the maximum by one.
    if not inclusive:
        limit -= 1

    if limit < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    equation["max"] = limit

    constraints["cost"] = equation
795
+
796
def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coefficient = parse_count(source)
    kind = source.get()
    # Only d(eletion), i(nsertion) and s(ubstitution) may appear in an
    # equation, each at most once.
    if kind not in "dis":
        raise ParseError()

    if kind in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient means 1.
    cost[kind] = int(coefficient or 1)
807
+
808
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting (after ':')."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            # Any other special character isn't a valid test.
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)
833
+
834
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits at the current
    # position.
    return source.get_while(DIGITS)
837
+
838
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Inline flags are scoped to the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n) or (?-n): a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...: possibly a verb such as (*PRUNE).
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[:1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Inline flags are scoped to the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
929
+
930
def parse_extension(source, info):
    "Parses a Python extension, ie. what follows '(?P'."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Inline flags are scoped to the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        # A group can't refer to itself while it's still being parsed.
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
965
+
966
def parse_comment(source):
    "Parses a comment."
    # Consume characters until an unescaped ")" or the end of the pattern.
    while True:
        rewind_pos = source.pos
        ch = source.get(True)

        if ch == ")" or not ch:
            break

        if ch == "\\":
            # Skip the escaped character.
            source.get(True)

    # Step back so that expect() sees (and validates) the closing ")".
    source.pos = rewind_pos
    source.expect(")")

    return None
982
+
983
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    outer_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Inline flags set inside the lookaround are scoped to it.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, body)
994
+
995
def parse_conditional(source, info):
    "Parses a conditional subpattern, ie. what follows '(?('."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # (?(group)...: a conditional on whether a group matched.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            # No alternative branch: matches the empty sequence.
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Inline flags are scoped to the conditional.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional is a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
1037
+
1038
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional, eg. '(?(?=...)...|...)'."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Inline flags are scoped to the lookaround.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # No alternative branch: matches the empty sequence.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
1057
+
1058
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    # Any flags set within the atomic group are local to it.
    previous_flags = info.flags
    try:
        parsed = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = previous_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(parsed)
1069
+
1070
def parse_common(source, info):
    "Parses a common groups branch."
    # Each alternative may reuse the same capture group numbers, so every
    # branch restarts numbering from the count seen before this construct.
    base_count = info.group_count
    alternatives = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = base_count
        alternatives.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    # The construct as a whole defines as many groups as its widest branch.
    info.group_count = highest_count
    source.expect(")")

    if len(alternatives) == 1:
        return alternatives[0]

    return Branch(alternatives)
1087
+
1088
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    # "R" is an alias for a recursive call to the whole pattern (group 0);
    # otherwise the group is identified by its (possibly multi-digit) number.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
1098
+
1099
def parse_rel_call_group(source, info, ch, pos):
    "Parses a relative call to a group."
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    # "+n" counts forwards from the groups defined so far; "-n" counts
    # backwards, with "-1" meaning the most recently defined group.
    if ch == "+":
        group = info.group_count + offset
    else:
        group = info.group_count - offset + 1

    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)
1113
+
1114
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    group_name = parse_name(source)
    source.expect(")")

    return CallGroup(info, group_name, pos)
1120
+
1121
def parse_flag_set(source):
    """Parses a set of inline flags.

    Consumes flag characters until one isn't recognised; that character is
    pushed back and the accumulated flag bits are returned.
    """
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            # "V" is only valid as part of a two-character version flag
            # ("V0" or "V1"), so consume the following character too.
            if ch == "V":
                ch += source.get()
            # An unrecognised flag character raises KeyError, ending the loop.
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag: rewind past the unrecognised character.
        source.pos = saved_pos

    return flags
1136
+
1137
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
1153
+
1154
def parse_subpattern(source, info, flags_on, flags_off):
    """Parses a subpattern with scoped flags ("(?flags:...)").

    The flags apply only for the duration of the subpattern; the previous
    flag state is restored afterwards, even on a parse error.
    """
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set: a newly supplied
    # encoding (ASCII/LOCALE/UNICODE) replaces any encoding already in effect.
    # BUGFIX: this previously tested info.flags, which meant that a scoped
    # group turning on *no* encoding flag (e.g. "(?i:...)" inside an ASCII
    # pattern) cleared the encoding that was already in effect. Testing
    # flags_on leaves the current encoding intact in that case.
    if flags_on & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
1172
+
1173
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # "(?flags:...)": a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # "(?flags)": positional flags that apply from here onwards.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
1207
+
1208
def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags."
    # Positional flags apply from this point onwards, so update the parsing
    # state in place rather than creating a scoped subpattern.
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
1212
+
1213
def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a name."
    # A name runs up to (but not including) the closing ")" or ">".
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A purely numeric "name" is a group number; validate its range.
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string,
          source.pos)

    return name
1231
+
1232
def is_octal(string):
    "Checks whether a string is octal."
    for ch in string:
        if ch not in OCT_DIGITS:
            return False

    return True
1235
+
1236
def is_decimal(string):
    "Checks whether a string is decimal."
    for ch in string:
        if ch not in DIGITS:
            return False

    return True
1239
+
1240
def is_hexadecimal(string):
    "Checks whether a string is hexadecimal."
    for ch in string:
        if ch not in HEX_DIGITS:
            return False

    return True
1243
+
1244
def parse_escape(source, info, in_set):
    """Parses an escape sequence.

    in_set indicates whether the escape occurs inside a character set, which
    disallows positional escapes, group references and similar constructs.
    """
    # The character after the backslash is taken literally even in VERBOSE
    # mode, so turn off space-skipping around the get().
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            # Unicode adds NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
            charset.extend([0x85, 0x2028, 0x2029])

        # CR LF is matched as a unit, hence the atomic branch.
        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes such as \d, \s, \w, per the current encoding.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes such as \n, \t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
1326
+
1327
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence: either an octal character escape or
    a backreference to a numbered group.
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            # Byte-oriented encodings limit the value to a single byte.
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
1359
+
1360
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence.

    digits is a list already containing the first digit (supplied by the
    caller); up to 3 digits in total are consumed.
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() fails when the caller-supplied first digit was "8" or "9".
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
1380
+
1381
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence."
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            # Fewer digits than the escape form requires.
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        value = None

    if value is not None and value < 0x110000:
        return make_character(info, value, in_set)

    # The value is outside the Unicode codepoint range.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, start_pos)
1403
+
1404
def parse_group_ref(source, info):
    "Parses a group reference."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    # A group can't be referenced while it's still being parsed.
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
1414
+
1415
def parse_string_set(source, info):
    "Parses a string set reference (\\L<name>)."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # The name must refer to a named list passed in as a keyword argument.
    if name is None or name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
1424
+
1425
def parse_named_char(source, info, in_set):
    "Parses a named character (\\N{...})."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                # unicodedata.lookup raises KeyError for unknown names.
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat "\N" as a literal "N".
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)
1440
+
1441
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, either delimited ("\\p{...}") or
    abbreviated (e.g. "\\pL").
    """
    saved_pos = source.pos

    # The encoding in effect applies to both forms of the property, so
    # determine it once up front instead of duplicating the checks in each
    # branch (behaviour is unchanged: info.flags isn't modified in between).
    if info.flags & ASCII:
        encoding = ASCII_ENCODING
    elif info.flags & UNICODE:
        encoding = UNICODE_ENCODING
    else:
        encoding = 0

    ch = source.get()
    if ch == "{":
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
1476
+
1477
def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns a (prop_name, name) pair; prop_name is None for an unqualified
    name.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Rewind to just after the last character actually consumed.
    source.pos = saved_pos
    return prop_name, name
1498
+
1499
def parse_set(source, info):
    "Parses a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace inside a set is always significant, even in VERBOSE mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        if version == VERSION0:
            # Version 0 has no set operations, only implicit union.
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
1524
+
1525
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    # A single operand needs no union wrapper.
    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
1534
+
1535
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    # A single operand needs no wrapper.
    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
1544
+
1545
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    # A single operand needs no wrapper.
    return operands[0] if len(operands) == 1 else SetInter(info, operands)
1554
+
1555
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    # A single operand needs no wrapper.
    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
1564
+
1565
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set; leave the "]" for the caller to consume.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; stop before the operator
            # so the caller can handle it.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
1587
+
1588
def parse_set_member(source, info):
    """Parses a member in a character set: a single item or a character
    range such as "a-z".
    """
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a single positive character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A single-character "range" collapses to the character itself.
        return start

    return Range(start.value, end.value)
1628
+
1629
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a nested
    set (version 1 only), or a literal character.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class; rewind and fall through.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
1666
+
1667
def parse_posix_class(source, info):
    "Parses a POSIX character class ([[:name:]])."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        # Not actually a POSIX class; the caller catches this and rewinds.
        raise ParseError()

    return lookup_property(prop_name, name, not negate, source, posix=True)
1675
+
1676
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # The local was previously named "error", shadowing the module-level
    # `error` exception class; renamed to avoid confusion.
    remainder = flt - int_part
    if abs(remainder) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / remainder)

    return int_part * den + num, den
1686
+
1687
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    sign = ""
    if numeric.startswith("-"):
        sign, numeric = "-", numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        # More than one "/" isn't a valid rational.
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # A denominator of 1 is left implicit.
    if result.endswith("/1"):
        result = result[ : -2]

    return result
1707
+
1708
def standardise_name(name):
    "Standardises a property or value name."
    joined = "".join(name)
    try:
        # Numeric names are normalised to a canonical rational form.
        return numeric_to_rational(joined)
    except (ValueError, ZeroDivisionError):
        # Otherwise drop separators and uppercase for loose matching.
        return "".join(ch for ch in joined if ch not in "_- ").upper()
1714
+
1715
# POSIX classes whose semantics differ from the like-named Unicode property;
# lookup_property prefixes these with "POSIX" when matched as [[:...:]].
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete value set of a binary (yes/no) Unicode property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
1718
+
1719
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property and returns the corresponding Property node.

    property may be None, in which case value is tried in turn as a general
    category, script or block value, a binary property name, or a prefixed
    ("IS"/"IN") script or block name. source, when given, is used only for
    error reporting.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the complement of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        # Not binary: match "any value of this property", i.e. the negation
        # of "no value".
        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    # (Both prefixes are 2 characters long, hence value[2 : ].)
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
1788
+
1789
def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns a (is_group, values) pair: if is_group is True, values holds a
    single group index; otherwise values holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Byte templates limit octal escapes to a single byte.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
1860
+
1861
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            # Fewer digits than the escape form requires.
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
1872
+
1873
def parse_repl_named_char(source):
    """Parses a named character (\\N{...}) in a replacement string.

    Returns the codepoint, or None if the syntax isn't a named character.
    """
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return ord(value)
            except KeyError:
                # unicodedata.lookup raises KeyError for unknown names.
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a named character; rewind and let the caller handle it.
    source.pos = saved_pos
    return None
1889
+
1890
def compile_repl_group(source, pattern):
    """Compiles a replacement template group reference (\\g<name-or-number>).

    Returns the group index.
    """
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric reference; group 0 is the whole match.
        index = int(name)
        if not 0 <= index <= pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        # NOTE(review): raises IndexError rather than this module's `error`
        # — presumably to mirror the standard "re" module; confirm before
        # changing.
        raise IndexError("unknown group")
1907
+
1908
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the nodes' dump() methods.
INDENT = " "
# Bit flags packed into a compiled opcode's argument.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# How far to shift an encoding value when packing it into the opcode flags.
ENCODING_OP_SHIFT = 5

# Text fragments used when dumping the parsed tree.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
1922
+
1923
def make_sequence(items):
    "Wraps the items in a Sequence, unless there is exactly one item."
    if len(items) != 1:
        return Sequence(items)

    return items[0]
1927
+
1928
# Common base class for all nodes.
class RegexBase:
    """Base class for all syntax-tree nodes.

    Subclasses set self._key to a tuple identifying the node's salient state;
    it drives __eq__ and __hash__.
    """
    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns a copy of the node with the given flags changed, or the
        node itself if nothing would change.
        """
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Most nodes contain no group references; subclasses override.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default a node can't contribute a first-set.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # (offset, required string) — no required string by default.
        return self.max_width(), None
1997
+
1998
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for nodes which match a position rather than characters
    (anchors and similar); max_width() is therefore 0.
    """
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # None in a first-set means "matches without consuming a character".
        return set([None])

    def _compile(self, reverse, fuzzy):
        # Pack the node's options into the opcode's flags argument.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0
2027
+
2028
class Any(RegexBase):
    """Matches any character except a line ending ("." without DOTALL)."""
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1
2046
+
2047
class AnyAll(Any):
    """Node matching any character including newlines (DOTALL '.')."""

    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # NOTE(review): attributes are set directly instead of via
        # RegexBase.__init__ — presumably intentional; confirm before changing.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
2057
+
2058
class AnyU(Any):
    """Unicode variant of '.': any character except a Unicode line break."""

    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
2061
+
2062
class Atomic(RegexBase):
    """Node for an atomic (non-backtracking) group wrapping a subpattern."""

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic wrapper around an empty pattern is pointless: drop it.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    # The following predicates all simply delegate to the wrapped subpattern.
    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        inner = self.subpattern.compile(reverse, fuzzy)
        return [(OP.ATOMIC, )] + inner + [(OP.END, )]

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        if type(self) is not type(other):
            return False
        return self.subpattern == other.subpattern

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
2117
+
2118
class Boundary(ZeroWidthBase):
    """Zero-width node for the BOUNDARY assertion."""

    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2121
+
2122
class Branch(RegexBase):
    """Node for a set of alternatives ('a|b|c').

    Optimisation flattens nested branches, hoists common prefixes/suffixes,
    and collapses single-character alternatives into character sets.

    BUG FIX: the final call in ``_merge_common_prefixes`` omitted the
    ``reverse`` argument, shifting the remaining arguments into the wrong
    parameters of ``_flush_char_prefix`` and raising TypeError when reached.
    """

    def __init__(self, branches):
        RegexBase.__init__(self)
        self.branches = branches

    def fix_groups(self, pattern, reverse, fuzzy):
        for b in self.branches:
            b.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)

    def _add_precheck(self, info, reverse, branches):
        # Build a set of the first (or last, when reversed) characters of the
        # branches; only possible when every branch is a case-sensitive
        # literal.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])

    def pack_characters(self, info):
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        return any(b.contains_group() for b in self.branches)

    def get_firstset(self, reverse):
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])

    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        code[-1] = (OP.END, )

        return code

    def dump(self, indent, reverse):
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)

    @staticmethod
    def _flatten_branches(info, reverse, branches):
        # Flatten the branches so that there aren't branches of branches.
        new_branches = []
        for b in branches:
            b = b.optimise(info, reverse)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches

    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches

    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches

    @staticmethod
    def _can_split(items, count):
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True

    @staticmethod
    def _can_split_rev(items, count):
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True

    @staticmethod
    def _merge_common_prefixes(info, reverse, branches):
        # Branches with the same case-sensitive character prefix can be grouped
        # together if they are separated only by other branches with a
        # character prefix.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                Branch._flush_char_prefix(info, reverse, prefixed, order,
                  new_branches)

                new_branches.append(b)

        # BUG FIX: 'reverse' was missing here, so the remaining arguments were
        # shifted into the wrong parameters and the call raised TypeError.
        Branch._flush_char_prefix(info, reverse, prefixed, order, new_branches)

        return new_branches

    @staticmethod
    def _is_simple_character(c):
        return isinstance(c, Character) and c.positive and not c.case_flags

    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches

    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()

    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()

    @staticmethod
    def _is_full_case(items, i):
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

    @staticmethod
    def _is_folded(items):
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False

    def is_empty(self):
        return all(b.is_empty() for b in self.branches)

    def __eq__(self, other):
        return type(self) is type(other) and self.branches == other.branches

    def max_width(self):
        return max(b.max_width() for b in self.branches)
2513
+
2514
class CallGroup(RegexBase):
    """Node for a subpattern call '(?N)'/'(?&name)' to a capture group."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a named reference to its numeric group.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        # The group may have changed, so rebuild the identity key.
        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): 'self.pattern' is not set anywhere visible in this
        # class — confirm the attribute exists on the base before relying on
        # the error message here.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        if type(self) is not type(other):
            return False
        return self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2559
+
2560
class CallRef(RegexBase):
    """Node wrapping a parsed subpattern so it can be called by reference."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        body = self.parsed._compile(reverse, fuzzy)
        return [(OP.CALL_REF, self.ref)] + body + [(OP.END, )]
2568
+
2569
class Character(RegexBase):
    """Node matching (or excluding) a single codepoint, with case options."""

    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Under full case-folding a single character may fold to several.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return {self}

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (ch == self.value) == self.positive

    def max_width(self):
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
2642
+
2643
class Conditional(RegexBase):
    """Node for a conditional subpattern '(?(group)yes|no)'.

    BUG FIX: ``remove_captures`` previously fell off the end and returned
    ``None``; callers rebind the node with the result (as every sibling
    ``remove_captures`` does), so the node would have been replaced by None.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: return the node itself; callers assign the result back.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2731
+
2732
class DefaultBoundary(ZeroWidthBase):
    """Zero-width node for the DEFAULT_BOUNDARY assertion."""

    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
2735
+
2736
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width node for the DEFAULT_END_OF_WORD assertion."""

    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
2739
+
2740
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width node for the DEFAULT_START_OF_WORD assertion."""

    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
2743
+
2744
class EndOfLine(ZeroWidthBase):
    """Zero-width node for the END_OF_LINE assertion."""

    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
2747
+
2748
class EndOfLineU(EndOfLine):
    """Unicode variant of the END_OF_LINE assertion."""

    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
2751
+
2752
class EndOfString(ZeroWidthBase):
    """Zero-width node for the END_OF_STRING assertion."""

    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
2755
+
2756
class EndOfStringLine(ZeroWidthBase):
    """Zero-width node for the END_OF_STRING_LINE assertion."""

    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
2759
+
2760
class EndOfStringLineU(EndOfStringLine):
    """Unicode variant of the END_OF_STRING_LINE assertion."""

    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
2763
+
2764
class EndOfWord(ZeroWidthBase):
    """Zero-width node for the END_OF_WORD assertion."""

    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
2767
+
2768
class Failure(ZeroWidthBase):
    """Node that always fails to match ('(*FAIL)')."""

    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
2773
+
2774
class Fuzzy(RegexBase):
    """Node for fuzzy (approximate) matching of a subpattern.

    ``constraints`` maps error types ("d" deletions, "i" insertions,
    "s" substitutions, "e" any error) to (min, max) pairs, plus an optional
    "cost" equation and "test" pattern.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = REVERSE_OP if reverse else 0

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        constraints = []

        for name in "ids":
            # Renamed from min/max to avoid shadowing the builtins.
            lo, hi = self.constraints[name]
            if hi == 0:
                continue

            con = ""

            if lo > 0:
                con = "{}<=".format(lo)

            con += name

            if hi is not None:
                con += "<={}".format(hi)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
2906
+
2907
class Grapheme(RegexBase):
    """Node matching a whole grapheme cluster ('\\X')."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
2921
+
2922
class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary; emits its opcode directly."""

    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
2925
+
2926
class GreedyRepeat(RegexBase):
    """Node for a greedy repeat of a subpattern ('x*', 'x+', 'x{m,n}')."""

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) keeps subclasses (e.g. PossessiveRepeat) intact.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # Only a fixed-count repeat of an atomic subpattern is atomic.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count,
          UNLIMITED if self.max_count is None else self.max_count]

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return [tuple(repeat)] + subpattern + [(OP.END, )]

    def dump(self, indent, reverse):
        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional: nothing is required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
3017
+
3018
class PossessiveRepeat(GreedyRepeat):
    """Greedy repeat wrapped in an atomic group ('x*+', 'x{m,n}+')."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count,
          UNLIMITED if self.max_count is None else self.max_count]

        # Wrap the repeat in an ATOMIC so it never gives characters back.
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
3047
+
3048
class Group(RegexBase):
    """A capture group node.

    `group` is the group number (negative for private/internal groups),
    `subpattern` the grouped expression, and `info` the shared pattern
    info object.
    """
    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        # Filled in elsewhere if this group is the target of a group call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        # Register this group (with its matching direction and fuzziness)
        # so references and calls to it can be resolved.
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Without captures the group reduces to its subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Negative group numbers are private groups: map to the public
        # number for reporting, and to a slot past the public groups for
        # the engine's internal bookkeeping.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        # If this group can be called, wrap it in a CALL_REF block.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Drop the back-reference to the info object to help break cycles.
        self.info = None
3129
+
3130
class Keep(ZeroWidthBase):
    """Zero-width node compiled to the KEEP opcode (the \\K marker)."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
3133
+
3134
class LazyRepeat(GreedyRepeat):
    """Same structure as GreedyRepeat, compiled to the LAZY_REPEAT opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
3137
+
3138
class LookAround(RegexBase):
    """A lookaround assertion node.

    `behind` selects lookbehind vs lookahead, `positive` the assertion's
    polarity, and `subpattern` the asserted expression.
    """
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern matches in the assertion's own direction, not the
        # enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive assertion on an empty pattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion matching in the search direction
        # constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without the fuzzy
        # argument (direction only) — matches upstream; verify intended.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0
3205
+
3206
class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround: matches `yes_item` when the
    assertion holds, otherwise `no_item`.
    """
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # All three parts are optimised for the assertion's direction.
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # Layout: CONDITIONAL <test> NEXT <yes> [NEXT <no>] END.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # Note precedence: (test empty and yes empty) or no empty.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        # Neither branch is guaranteed, so no single string is required.
        return self.max_width(), None
3290
+
3291
class PrecompiledCode(RegexBase):
    """Wraps an already-built code sequence so it can sit in the parse tree;
    compilation emits it verbatim as a single instruction tuple.
    """
    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        # Direction and fuzziness are ignored: the code is pre-baked.
        return [tuple(self.code)]
3297
+
3298
class Property(RegexBase):
    """A character-property test (e.g. \\p{...}); `value` packs the property
    id in the high 16 bits and the property value in the low 16 bits.
    """
    # Opcode selection by (case flags, reversed); FULLCASE folds to the
    # same opcodes as NOCASE/IGNORECASE since a single property needs no
    # full case-folding.
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding selector is carried in the upper flag bits.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
3353
+
3354
class Prune(ZeroWidthBase):
    """Zero-width node compiled to the PRUNE opcode (the (*PRUNE) verb)."""
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE carries no flags or operands.
        return [(OP.PRUNE, )]
3359
+
3360
class Range(RegexBase):
    """An inclusive codepoint range [lower..upper]."""
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1
3436
+
3437
class RefGroup(RegexBase):
    """A backreference to a capture group, by number or by name."""
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a named reference to its group number, then validate it.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group number is final.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # NOTE(review): uses self.pattern, which this class never assigns —
        # presumably this path is only reached when it is set elsewhere;
        # matches upstream, verify before changing.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Drop the back-reference to the info object to help break cycles.
        self.info = None
3485
+
3486
class SearchAnchor(ZeroWidthBase):
    """Zero-width node compiled to the SEARCH_ANCHOR opcode."""
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
3489
+
3490
class Sequence(RegexBase):
    """A concatenation of pattern nodes, matched one after another."""
    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        """Packs runs of adjacent characters into strings."""
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new character is cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                    case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string is cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                    case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # A non-character node ends the current run.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Accumulate first-sets until one item cannot match empty (i.e.
        # None is absent from its first-set).
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # When matching in reverse the items are emitted back to front.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        # Clear the run in place so the caller's list is reset too.
        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record every span of the folded string that matches an expanding
        # character's folding.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping/adjacent (start, end) spans into maximal spans.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        # The first item that yields a required string wins; offsets of the
        # items before it accumulate.
        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
3702
+
3703
class SetBase(RegexBase):
    """Base class for character-set operation nodes (union, intersection,
    difference, symmetric difference).
    """
    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # Layout: <set opcode> <member code>... END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        # The widest match is the longest full case-folding in the set.
        return max(len(folded) for folded in seen)

    def __del__(self):
        # Drop the back-reference to the info object to help break cycles.
        self.info = None
3807
+
3808
class SetDiff(SetBase):
    """Set difference: matches the first item but not the second."""
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        # Normalise A - B - C - ... into A - (B | C | ...).
        if len(items) > 2:
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive
3833
+
3834
class SetInter(SetBase):
    """Set intersection: matches only what every item matches."""
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive
3863
+
3864
class SetSymDiff(SetBase):
    """Set symmetric difference: matches what an odd number of items match."""
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the membership of every item.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
3896
+
3897
class SetUnion(SetBase):
    """Set union: matches what any of its items matches."""
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # Anything unioned with "match all" matches all.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        # A property and its negation together match everything.
        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Bucket single characters by polarity so each bucket can be
        # emitted as one CHARACTER or STRING instruction.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
3974
+
3975
class Skip(ZeroWidthBase):
    """Zero-width node compiled to the SKIP opcode (the (*SKIP) verb)."""
    _op_name = "SKIP"
    _opcode = OP.SKIP
3978
+
3979
class StartOfLine(ZeroWidthBase):
    """Zero-width node compiled to the START_OF_LINE opcode."""
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
3982
+
3983
class StartOfLineU(StartOfLine):
    """StartOfLine variant compiled to the START_OF_LINE_U opcode."""
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
3986
+
3987
class StartOfString(ZeroWidthBase):
    """Zero-width node compiled to the START_OF_STRING opcode."""
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
3990
+
3991
class StartOfWord(ZeroWidthBase):
    """Zero-width node compiled to the START_OF_WORD opcode."""
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
3994
+
3995
class String(RegexBase):
    """A literal run of characters, stored as a tuple of codepoints."""
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Precompute the fully case-folded form for matching.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set elsewhere when this string is the pattern's required string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        # The string itself is required, at offset 0.
        return 0, self
4049
+
4050
class Literal(String):
    "A String that is dumped as a literal match."

    def dump(self, indent, reverse):
        text = ascii("".join(map(chr, self.characters))).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, text,
          CASE_TEXT[self.case_flags]))
4056
+
4057
class StringSet(Branch):
    """A branch over the strings of a named list (supplied via kwargs).

    Each string in the list becomes one branch of character nodes; the
    branches are tried longest-first.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this named list so the compiled pattern can refer to
        # it by index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # Fix: the original computed `index`, `encoding` and `fold_flags`
        # here but never used them; the dead locals have been removed.
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Drop the back-reference to the Info object.
        self.info = None
4097
+
4098
class Source:
    "Scanner for the regular expression source string."
    def __init__(self, string):
        # Accept either str or bytes patterns; bytes are decoded via
        # Latin-1 so that each byte maps to exactly one code point.
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        # When True (VERBOSE mode), whitespace and "#" comments in the
        # pattern are skipped by the methods below.
        self.ignore_space = False
        # An empty slice of the original object: "" for str, b"" for
        # bytes. Used as an end-of-input sentinel of the right type.
        self.sep = string[ : 0]

    def peek(self, override_ignore=False):
        "Returns the next significant character without consuming it."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        "Returns the next significant character, consuming it."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        "Returns up to `count` significant characters, consuming them."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Consumes and returns the run of characters whose membership in
        `test_set` equals `include`.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Consumes the run of characters whose membership in `test_set`
        equals `include`, discarding them.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Consumes `substring` if it appears next, returning whether it
        did. On failure the position is left unchanged.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        "Consumes `substring`, raising an error if it isn't next."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        "Returns whether only ignorable text remains in the pattern."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
4343
+
4344
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        # Fix: the original used the mutable default `kwargs={}`, which
        # would be shared across all Info instances created without an
        # explicit kwargs argument.
        if kwargs is None:
            kwargs = {}

        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}
        self.group_name = {}
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Opens a capture group, allocating its number.

        Returns the group's number, or a negative private alias when the
        group is already open (a nested use of the same named group).
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not taken by a named group.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Closes the most recently opened group."
        self.open_groups.pop()

    def is_open_group(self, name):
        "Returns whether the group (by name or number string) is open."
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
4408
+
4409
+ def _check_group_features(info, parsed):
4410
+ """Checks whether the reverse and fuzzy features of the group calls match
4411
+ the groups which they call.
4412
+ """
4413
+ call_refs = {}
4414
+ additional_groups = []
4415
+ for call, reverse, fuzzy in info.group_calls:
4416
+ # Look up the reference of this group call.
4417
+ key = (call.group, reverse, fuzzy)
4418
+ ref = call_refs.get(key)
4419
+ if ref is None:
4420
+ # This group doesn't have a reference yet, so look up its features.
4421
+ if call.group == 0:
4422
+ # Calling the pattern as a whole.
4423
+ rev = bool(info.flags & REVERSE)
4424
+ fuz = isinstance(parsed, Fuzzy)
4425
+ if (rev, fuz) != (reverse, fuzzy):
4426
+ # The pattern as a whole doesn't have the features we want,
4427
+ # so we'll need to make a copy of it with the desired
4428
+ # features.
4429
+ additional_groups.append((CallRef(len(call_refs), parsed),
4430
+ reverse, fuzzy))
4431
+ else:
4432
+ # Calling a capture group.
4433
+ def_info = info.defined_groups[call.group]
4434
+ group = def_info[0]
4435
+ if def_info[1 : ] != (reverse, fuzzy):
4436
+ # The group doesn't have the features we want, so we'll
4437
+ # need to make a copy of it with the desired features.
4438
+ additional_groups.append((group, reverse, fuzzy))
4439
+
4440
+ ref = len(call_refs)
4441
+ call_refs[key] = ref
4442
+
4443
+ call.call_ref = ref
4444
+
4445
+ info.call_refs = call_refs
4446
+ info.additional_groups = additional_groups
4447
+
4448
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, characters, case_flags); offset is -1 when the
    required string's offset is unbounded.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        # No required string could be determined.
        return 0, (), 0

    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
4468
+
4469
class Scanner:
    """A lexical scanner built from a lexicon of (phrase, action) pairs.

    Each phrase is compiled as one branch of a compound pattern; the index
    of the branch that matched selects the action to apply.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern.
        info = Info(flags)
        # Each phrase gets its own numbered group so lastindex identifies
        # the branch that matched.
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scans `string`, returning (results, remainder).

        Callable actions are invoked as action(self, matched_text); their
        non-None results (or non-callable actions themselves) are
        collected into the results list.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would not advance; stop scanning.
                break
            # lastindex is the 1-based branch (phrase) that matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                # Expose the match object to the action via self.match.
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
4564
+
4565
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: property id -> (name, values).
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    # Keep the longest alias as the canonical property name.
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        # Likewise keep the longest alias for each property value.
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)
4578
+
4579
# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Encoding selectors passed to lookup_property() and the position-escape
# constructors below.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2
4592
+
4593
# Predefined character set escape sequences.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# ASCII-encoding variants of the charset escapes. \h keeps its default
# definition.
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
})
# Unicode-encoding variants of the charset escapes. \h keeps its default
# definition.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
})
4622
+
4623
# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}
# ASCII-encoding variants of the word-related position escapes.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES.update({
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
})
# Unicode-encoding variants of the word-related position escapes.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES.update({
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
})
4656
+
4657
+ # Regex control verbs.
4658
+ VERBS = {
4659
+ "FAIL": Failure(),
4660
+ "F": Failure(),
4661
+ "PRUNE": Prune(),
4662
+ "SKIP": Skip(),
4663
+ }