jaclang 0.8.4__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of jaclang might be problematic. Click here for more details.

Files changed (53) hide show
  1. jaclang/cli/cli.py +74 -22
  2. jaclang/compiler/jac.lark +3 -3
  3. jaclang/compiler/larkparse/jac_parser.py +2 -2
  4. jaclang/compiler/parser.py +14 -21
  5. jaclang/compiler/passes/main/__init__.py +3 -1
  6. jaclang/compiler/passes/main/binder_pass.py +594 -0
  7. jaclang/compiler/passes/main/import_pass.py +8 -256
  8. jaclang/compiler/passes/main/inheritance_pass.py +2 -2
  9. jaclang/compiler/passes/main/pyast_gen_pass.py +35 -69
  10. jaclang/compiler/passes/main/pyast_load_pass.py +24 -13
  11. jaclang/compiler/passes/main/sem_def_match_pass.py +1 -1
  12. jaclang/compiler/passes/main/tests/fixtures/M1.jac +3 -0
  13. jaclang/compiler/passes/main/tests/fixtures/sym_binder.jac +47 -0
  14. jaclang/compiler/passes/main/tests/test_binder_pass.py +111 -0
  15. jaclang/compiler/passes/main/tests/test_pyast_gen_pass.py +13 -13
  16. jaclang/compiler/passes/main/tests/test_sem_def_match_pass.py +6 -6
  17. jaclang/compiler/passes/tool/doc_ir_gen_pass.py +2 -0
  18. jaclang/compiler/passes/tool/tests/fixtures/simple_walk_fmt.jac +6 -0
  19. jaclang/compiler/program.py +15 -8
  20. jaclang/compiler/tests/test_sr_errors.py +32 -0
  21. jaclang/compiler/unitree.py +21 -15
  22. jaclang/langserve/engine.jac +23 -4
  23. jaclang/langserve/tests/test_server.py +13 -0
  24. jaclang/runtimelib/importer.py +33 -62
  25. jaclang/runtimelib/utils.py +29 -0
  26. jaclang/tests/fixtures/pyfunc_fmt.py +60 -0
  27. jaclang/tests/fixtures/pyfunc_fstr.py +25 -0
  28. jaclang/tests/fixtures/pyfunc_kwesc.py +33 -0
  29. jaclang/tests/fixtures/python_run_test.py +19 -0
  30. jaclang/tests/test_cli.py +67 -0
  31. jaclang/tests/test_language.py +96 -1
  32. jaclang/utils/lang_tools.py +3 -3
  33. jaclang/utils/module_resolver.py +90 -0
  34. jaclang/utils/symtable_test_helpers.py +125 -0
  35. jaclang/utils/test.py +3 -4
  36. jaclang/vendor/interegular/__init__.py +34 -0
  37. jaclang/vendor/interegular/comparator.py +163 -0
  38. jaclang/vendor/interegular/fsm.py +1015 -0
  39. jaclang/vendor/interegular/patterns.py +732 -0
  40. jaclang/vendor/interegular/py.typed +0 -0
  41. jaclang/vendor/interegular/utils/__init__.py +15 -0
  42. jaclang/vendor/interegular/utils/simple_parser.py +165 -0
  43. jaclang/vendor/interegular-0.3.3.dist-info/INSTALLER +1 -0
  44. jaclang/vendor/interegular-0.3.3.dist-info/LICENSE.txt +21 -0
  45. jaclang/vendor/interegular-0.3.3.dist-info/METADATA +64 -0
  46. jaclang/vendor/interegular-0.3.3.dist-info/RECORD +20 -0
  47. jaclang/vendor/interegular-0.3.3.dist-info/REQUESTED +0 -0
  48. jaclang/vendor/interegular-0.3.3.dist-info/WHEEL +5 -0
  49. jaclang/vendor/interegular-0.3.3.dist-info/top_level.txt +1 -0
  50. {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/METADATA +1 -1
  51. {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/RECORD +53 -29
  52. {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/WHEEL +0 -0
  53. {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,732 @@
1
+ """
2
+ Allows the parsing of python-style regexes to FSMs.
3
+ Main access point is `parse_pattern(str) -> Pattern`.
4
+ Most other classes are internal and should not be used.
5
+ """
6
+
7
+ from abc import abstractmethod, ABC
8
+ from dataclasses import dataclass
9
+ from enum import Flag, auto
10
+ from textwrap import indent
11
+ from typing import Iterable, FrozenSet, Optional, Tuple, Union
12
+
13
+ from interegular.fsm import FSM, anything_else, epsilon, Alphabet
14
+ from interegular.utils.simple_parser import SimpleParser, nomatch, NoMatch
15
+
16
+ __all__ = ['parse_pattern', 'Pattern', 'Unsupported', 'InvalidSyntax', 'REFlags']
17
+
18
+
19
class Unsupported(Exception):
    """Raised when a pattern uses a regex feature that this module does not implement."""
21
+
22
+
23
class InvalidSyntax(Exception):
    """Raised when the pattern text cannot be parsed as a regex."""
25
+
26
+
27
class REFlags(Flag):
    """Regex flags understood by this module; each has a long name and a one-letter alias."""
    CASE_INSENSITIVE = auto()
    I = CASE_INSENSITIVE
    MULTILINE = auto()
    M = MULTILINE
    SINGLE_LINE = auto()
    S = SINGLE_LINE
31
+
32
+
33
# Lookup table from a single-letter flag name to its REFlags member.
# Letters without an entry here are rejected by `_get_flags`.
_flags = {
    'i': REFlags.I,
    'm': REFlags.M,
    's': REFlags.S,
}
38
+
39
+
40
def _get_flags(plus: str) -> REFlags:
    """Translate a string of single-letter flag names into one combined `REFlags` value.

    :param plus: flag letters, e.g. ``"is"``; an empty string yields ``REFlags(0)``.
    :raises Unsupported: if a letter has no entry in `_flags`.
    """
    res = REFlags(0)
    for c in plus:
        try:
            res |= _flags[c]
        except KeyError:
            # The failed dict lookup is an implementation detail, so keep it
            # out of the traceback the caller sees.
            raise Unsupported(f"Flag {c} is not implemented") from None
    return res
48
+
49
+
50
+ def _combine_flags(base: REFlags, added: REFlags, removed: REFlags):
51
+ base |= added
52
+ base &= ~removed
53
+ # TODO: Check for incorrect combinations (aLu)
54
+ return base
55
+
56
+
57
@dataclass(frozen=True)
class _BasePattern(ABC):
    """Abstract base of the pattern nodes.

    Subclasses implement the `_get_*` hooks, `to_fsm` and `simplify`. The public
    accessors defined here compute each value once and store it on the instance.
    Because the dataclass is frozen, the cache slots are written through
    `object.__setattr__`.
    """
    __slots__ = '_alphabet_cache', '_prefix_cache', '_lengths_cache'

    @abstractmethod
    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=None) -> FSM:
        """Build the FSM for this pattern (implemented by subclasses)."""
        raise NotImplementedError

    @abstractmethod
    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        """Compute the alphabet for `flags`, without caching."""
        raise NotImplementedError

    def get_alphabet(self, flags: REFlags) -> Alphabet:
        """Return the alphabet for `flags`, computing it at most once per flag value."""
        if not hasattr(self, '_alphabet_cache'):
            object.__setattr__(self, '_alphabet_cache', {})
        cache = self._alphabet_cache
        if flags not in cache:
            cache[flags] = self._get_alphabet(flags)
        return cache[flags]

    @abstractmethod
    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        """Compute the value served by `prefix_postfix`, without caching."""
        raise NotImplementedError

    @property
    def prefix_postfix(self) -> Tuple[int, Optional[int]]:
        """Returns the number of dots that have to be pre-/postfixed to support look(aheads|backs)"""
        if hasattr(self, '_prefix_cache'):
            return self._prefix_cache
        computed = self._get_prefix_postfix()
        object.__setattr__(self, '_prefix_cache', computed)
        return computed

    @abstractmethod
    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        """Compute the value served by `lengths`, without caching."""
        raise NotImplementedError

    @property
    def lengths(self) -> Tuple[int, Optional[int]]:
        """Returns the minimum and maximum length that this pattern can match
        (maximum is None if the length is unbounded)"""
        if hasattr(self, '_lengths_cache'):
            return self._lengths_cache
        computed = self._get_lengths()
        object.__setattr__(self, '_lengths_cache', computed)
        return computed

    @abstractmethod
    def simplify(self) -> '_BasePattern':
        """Return a simplified version of this pattern (implemented by subclasses)."""
        raise NotImplementedError
102
+
103
+
104
class _Repeatable(_BasePattern, ABC):
    """Marker base class for the patterns that `_Repeated` accepts as its `base`."""
106
+
107
+
108
@dataclass(frozen=True)
class _CharGroup(_Repeatable):
    """Represents the smallest possible pattern that can be matched: A single char.
    Direct port from the lego module"""
    chars: FrozenSet[str]
    negated: bool
    __slots__ = 'chars', 'negated'

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        """Alphabet made of this group's chars (both cases if case-insensitive) plus `anything_else`."""
        if not flags & REFlags.CASE_INSENSITIVE:
            return Alphabet.from_groups(self.chars, {anything_else})
        both_cases = {*map(str.lower, self.chars), *map(str.upper, self.chars)}
        return Alphabet.from_groups(both_cases, {anything_else})

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return 0, 0

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        # Always matches exactly one character.
        return 1, 1

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build a two-state FSM that accepts exactly one character of this group.

        Raises `ValueError` if a non-zero prefix/postfix is requested and
        `Unsupported` if any flag other than CASE_INSENSITIVE / SINGLE_LINE is set.
        """
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        if prefix_postfix != (0, 0):
            raise ValueError("Can not have prefix/postfix on CharGroup-level")

        insensitive = flags & REFlags.CASE_INSENSITIVE
        # Whatever is left after dropping the two understood flags is unsupported.
        leftover = flags & ~REFlags.CASE_INSENSITIVE & ~REFlags.SINGLE_LINE
        if leftover:
            raise Unsupported(leftover)

        if insensitive:
            chars = frozenset({*(c.lower() for c in self.chars), *(c.upper() for c in self.chars)})
        else:
            chars = self.chars

        # State 0 is initial, state 1 is final.
        # A negated group accepts every alphabet symbol except its own chars;
        # a normal group accepts only its own chars.
        accepted = (set(alphabet) - chars) if self.negated else chars
        mapping = {
            0: {alphabet[symbol]: 1 for symbol in accepted},
        }

        return FSM(
            alphabet=alphabet,
            states={0, 1},
            initial=0,
            finals={1},
            map=mapping,
        )

    def simplify(self) -> '_CharGroup':
        # Already as simple as it gets.
        return self
170
+
171
+
172
def _combine_char_groups(*groups: _CharGroup, negate):
    """Return the union of `groups` as one `_CharGroup`, complemented if `negate` is true.

    A non-negated group contributes its chars. A negated group matches every
    character *outside* its chars, so the union of several negated groups
    misses only the characters that all of them list (the intersection of
    their char sets, not the union). Characters also matched by a positive
    group are removed from that "missing" set.

    :param groups: the character groups to merge.
    :param negate: keyword-only; whether the merged group itself is negated.
    """
    pos = set().union(*(g.chars for g in groups if not g.negated))
    excluded = [frozenset(g.chars) for g in groups if g.negated]
    if excluded:
        # Test for the *presence* of negated groups, not for a non-empty char
        # set: a negated group with no chars matches everything and must not
        # be silently dropped.
        missing = frozenset.intersection(*excluded)
        return _CharGroup(frozenset(missing - pos), not negate)
    return _CharGroup(frozenset(pos), negate)
179
+
180
+
181
@dataclass(frozen=True)
class __DotCls(_Repeatable):
    """The `.` pattern: one arbitrary character, excluding newline unless SINGLE_LINE is set."""

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build a two-state FSM accepting one symbol (state 0 is initial, 1 is final)."""
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if flags is not None and flags & REFlags.SINGLE_LINE:
            symbols = alphabet
        else:
            # Without SINGLE_LINE the dot must not match a newline.
            symbols = set(alphabet) - {'\n'}
        transitions = {0: {alphabet[sym]: 1 for sym in symbols}}
        return FSM(
            alphabet=alphabet,
            states={0, 1},
            initial=0,
            finals={1},
            map=transitions,
        )

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        if flags & REFlags.SINGLE_LINE:
            return Alphabet.from_groups({anything_else})
        # Newline needs its own symbol so it can be excluded in `to_fsm`.
        return Alphabet.from_groups({anything_else}, {'\n'})

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return 0, 0

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        # Always matches exactly one character.
        return 1, 1

    def simplify(self) -> '__DotCls':
        return self
213
+
214
+
215
@dataclass(frozen=True)
class __EmptyCls(_BasePattern):
    """Pattern of length zero; its FSM is `epsilon(alphabet)`."""

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        chosen = self.get_alphabet(flags) if alphabet is None else alphabet
        return epsilon(chosen)

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        # No characters of its own, so only the catch-all symbol is needed.
        return Alphabet.from_groups({anything_else})

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return 0, 0

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        # Matches zero characters.
        return 0, 0

    def simplify(self) -> '__EmptyCls':
        return self
234
+
235
+
236
# Shared instances of the two field-less pattern classes.
_DOT = __DotCls()
_EMPTY = __EmptyCls()
# Empty char set, not negated: accepts no character at all.
_NONE = _CharGroup(frozenset(""), False)
# Empty char set, negated: accepts every symbol of the alphabet.
_ALL = _CharGroup(frozenset(""), True)
# Escape letter -> character group, looked up by `_ParsePattern.escaped`.
# The upper-case class escapes are the negated form of the lower-case ones.
_CHAR_GROUPS = {
    'w': _CharGroup(frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"), False),
    'W': _CharGroup(frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"), True),
    'd': _CharGroup(frozenset("0123456789"), False),
    'D': _CharGroup(frozenset("0123456789"), True),
    's': _CharGroup(frozenset(" \t\n\r\f\v"), False),
    'S': _CharGroup(frozenset(" \t\n\r\f\v"), True),

    # Single control characters.
    'a': _CharGroup(frozenset("\a"), False),
    'b': _CharGroup(frozenset("\b"), False),
    'f': _CharGroup(frozenset("\f"), False),
    'n': _CharGroup(frozenset("\n"), False),
    'r': _CharGroup(frozenset("\r"), False),
    't': _CharGroup(frozenset("\t"), False),
    'v': _CharGroup(frozenset("\v"), False),
}
256
+
257
+
258
@dataclass(frozen=True)
class _Repeated(_BasePattern):
    """Represents a repeated pattern. `base` can be matched from `min` to `max` times.
    `max` may be None to signal infinite"""
    base: _Repeatable
    min: int
    max: Optional[int]

    def __str__(self):
        upper = self.max if self.max is not None else ''
        header = f"Repeated[{self.min}:{upper}]:\n"
        return header + f"{indent(str(self.base), ' ')}"

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        # Repetition adds no symbols of its own.
        return self.base.get_alphabet(flags)

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return self.base.prefix_postfix

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        shortest, longest = self.base.lengths
        if longest is None or self.max is None:
            # Unbounded base or unbounded repeat count: no upper limit.
            return shortest * self.min, None
        return shortest * self.min, longest * self.max

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build the FSM as `min` mandatory copies of the base followed by the optional part.

        Raises `ValueError` if a non-zero prefix/postfix is requested.
        """
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        if prefix_postfix != (0, 0):
            raise ValueError("Can not have prefix/postfix on CharGroup-level")

        unit = self.base.to_fsm(alphabet, (0, 0), flags=flags)
        mandatory = unit * self.min
        if self.max is None:
            optional = unit.star()
        else:
            # Make one copy of the unit skippable by marking its initial state
            # as final, then chain (max - min) of those.
            optional = unit.copy()
            optional.__dict__['finals'] |= {optional.initial}
            optional *= (self.max - self.min)
        return mandatory + optional

    def simplify(self) -> '_Repeated':
        return type(self)(self.base.simplify(), self.min, self.max)
300
+
301
+
302
# `_ALL` repeated zero or more times, with no upper bound.
_ALL_STAR = _Repeated(_ALL, 0, None)
303
+
304
+
305
@dataclass(frozen=True)
class _NonCapturing:
    """Represents a lookahead/lookback. Matches `inner` without 'consuming' anything. Can be negated.
    Only valid inside a `_Concatenation`"""
    inner: _BasePattern
    backwards: bool
    negate: bool
    __slots__ = 'inner', 'backwards', 'negate'

    def get_alphabet(self, flags: REFlags) -> Alphabet:
        # The alphabet is entirely determined by the wrapped pattern.
        return self.inner.get_alphabet(flags)

    def simplify(self) -> '_NonCapturing':
        simplified_inner = self.inner.simplify()
        return type(self)(simplified_inner, self.backwards, self.negate)
319
+
320
+
321
@dataclass(frozen=True)
class _Concatenation(_BasePattern):
    """Represents multiple Patterns that have to be match in a row. Can contain `_NonCapturing`"""
    parts: Tuple[Union[_BasePattern, _NonCapturing], ...]
    __slots__ = 'parts',

    def __str__(self):
        body = "\n".join(indent(str(p), ' ') for p in self.parts)
        return "Concatenation:\n" + body

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        alphabets = (p.get_alphabet(flags) for p in self.parts)
        return Alphabet.union(*alphabets)[0]

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        # Forward pass: what is the longest a lookback could stick out over the beginning?
        pre = 0
        consumed = 0  # minimum number of chars consumed so far
        for p in self.parts:
            if not isinstance(p, _NonCapturing):
                consumed += p.lengths[0]
                continue
            if not p.backwards:
                continue
            a, b = p.inner.lengths
            if a != b:
                raise InvalidSyntax(f"lookbacks have to have fixed length {(a, b)}")
            pre = max(pre, a - consumed)

        # Backward pass: the same question for lookaheads past the end.
        post = 0
        consumed = 0
        for p in reversed(self.parts):
            if not isinstance(p, _NonCapturing):
                consumed += p.lengths[0]
                continue
            if p.backwards:
                continue
            a, b = p.inner.lengths
            # TODO: is this correct? (falls back to the minimum when unbounded)
            req = (a if b is None else b) - consumed
            post = max(post, req)
        return pre, post

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        low, high = 0, 0
        for p in self.parts:
            if isinstance(p, _NonCapturing):
                # Lookarounds consume nothing.
                continue
            part_low, part_high = p.lengths
            low += part_low
            if high is None or part_high is None:
                high = None
            else:
                high = high + part_high
        return low, high

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Concatenate the parts' FSMs, applying lookaheads as intersection/difference.

        Raises `Unsupported` for lookbacks and for a requested prefix/postfix
        smaller than what this concatenation itself needs.
        """
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        if prefix_postfix[0] < self.prefix_postfix[0] or prefix_postfix[1] < self.prefix_postfix[1]:
            raise Unsupported("Group can not have lookbacks/lookaheads that go beyond the group bounds.")

        all_ = _ALL.to_fsm(alphabet)
        all_star = all_.star()
        # Alternating entries: (None, [plain FSMs]) and (lookahead, its FSM).
        segments = []
        pending = [all_.times(prefix_postfix[0])]
        for part in self.parts:
            if not isinstance(part, _NonCapturing):
                pending.append(part.to_fsm(alphabet, (0, 0), flags))
                continue
            inner = part.inner.to_fsm(alphabet, (0, 0), flags)
            if part.backwards:
                raise Unsupported("lookbacks are not implemented")
            # Infinite-length lookaheads are not rejected here (an earlier
            # cardinality check was disabled).
            segments.append((None, pending))
            segments.append((part, inner))
            pending = []
        pending.append(all_.times(prefix_postfix[1]))

        # Fold from the right so each lookahead constrains everything after it.
        result = FSM.concatenate(*pending)
        for marker, fsm in reversed(segments):
            if marker is None:
                result = FSM.concatenate(*fsm, result)
                continue
            assert isinstance(marker, _NonCapturing) and not marker.backwards
            if marker.negate:
                result = result.difference(fsm + all_star)  # TODO: This does not feel right...
            else:
                result = result.intersection(fsm + all_star)
        return result

    def simplify(self) -> '_Concatenation':
        simplified = tuple(p.simplify() for p in self.parts)
        return type(self)(simplified)
412
+
413
+
414
@dataclass(frozen=True)
class Pattern(_Repeatable):
    """An alternation: any one of `options` may match.

    `added_flags` / `removed_flags` are merged into the caller's flags before
    they are handed to the options.
    """
    options: Tuple[_BasePattern, ...]
    added_flags: REFlags = REFlags(0)
    removed_flags: REFlags = REFlags(0)

    def __str__(self):
        body = "\n".join(indent(str(o), ' ') for o in self.options)
        return "Pattern:\n" + body

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        effective = _combine_flags(flags, self.added_flags, self.removed_flags)
        alphabets = (p.get_alphabet(effective) for p in self.options)
        return Alphabet.union(*alphabets)[0]

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        # Smallest minimum over all options; largest maximum, where None
        # (unbounded) wins once any option reports it.
        low, high = None, 0
        for option in self.options:
            opt_low, opt_high = option.lengths
            if low is None or opt_low < low:
                low = opt_low
            if opt_high is None or (high is not None and opt_high > high):
                high = opt_high
        return low, high

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        pre, post = 0, 0
        for option in self.options:
            opt_pre, opt_post = option.prefix_postfix
            pre = max(pre, opt_pre)
            if opt_post is None or (post is not None and opt_post > post):
                post = opt_post
        return pre, post

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build the union of the options' FSMs under this pattern's effective flags."""
        flags = _combine_flags(flags, self.added_flags, self.removed_flags)
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        return FSM.union(*(o.to_fsm(alphabet, prefix_postfix, flags) for o in self.options))

    def with_flags(self, added: REFlags, removed: REFlags = REFlags(0)) -> 'Pattern':
        """Return a pattern with the same options but the given flag sets."""
        return type(self)(self.options, added, removed)

    def simplify(self) -> 'Pattern':
        """Simplify all options; unwrap a pattern whose only content is another pattern."""
        if len(self.options) == 1:
            only = self.options[0]
            if isinstance(only, _Concatenation) and len(only.parts) == 1 and isinstance(only.parts[0], Pattern):
                inner: Pattern = only.parts[0].simplify()
                own = _combine_flags(REFlags(0), self.added_flags, self.removed_flags)
                merged = _combine_flags(own, inner.added_flags, inner.removed_flags)
                # NOTE(review): only the merged *added* flags are kept here; the
                # removed flags of both levels are not carried over - confirm intended.
                return inner.with_flags(merged)
        simplified = tuple(o.simplify() for o in self.options)
        return type(self)(simplified, self.added_flags, self.removed_flags)
467
+
468
+
469
class _ParsePattern(SimpleParser[Pattern]):
    """Recursive-descent parser that turns a python-style regex string into a `Pattern` tree.

    Features that are recognised but not implemented raise `Unsupported`;
    malformed input raises `InvalidSyntax`.
    """
    # Characters with a special meaning outside of `[...]`.
    SPECIAL_CHARS_STANDARD: FrozenSet[str] = frozenset({
        '+', '?', '*', '.', '$', '^', '\\', '(', ')', '[', '|'
    })
    # Characters with a special meaning inside of `[...]`.
    SPECIAL_CHARS_INNER: FrozenSet[str] = frozenset({
        '\\', ']'
    })
    # Escapes that are rejected as unsupported when used outside of `[...]`.
    RESERVED_ESCAPES: FrozenSet[str] = frozenset({
        'u', 'U', 'A', 'Z', 'b', 'B'
    })

    def __init__(self, data: str):
        super(_ParsePattern, self).__init__(data)
        # Flags set by a bare `(?flags)` group; applied to the whole pattern in `start`.
        self.flags = None

    def parse(self):
        """Parse the whole input, translating the parser's `NoMatch` into `InvalidSyntax`."""
        try:
            return super(_ParsePattern, self).parse()
        except NoMatch as err:
            # Keep the parser's diagnostics instead of raising a bare error.
            raise InvalidSyntax(str(err)) from err

    def start(self):
        """Entry rule: a pattern, wrapped with any flags collected from `(?flags)` groups."""
        self.flags = None
        p = self.pattern()
        if self.flags is not None:
            p = p.with_flags(self.flags)
        return p

    def pattern(self):
        """One or more concatenations separated by `|`."""
        options = [self.conc()]
        while self.static_b('|'):
            options.append(self.conc())
        return Pattern(tuple(options))

    def conc(self):
        """A (possibly empty) sequence of objects; stops at the first one that does not match."""
        parts = []
        while True:
            try:
                parts.append(self.obj())
            except nomatch:
                break
        return _Concatenation(tuple(parts))

    def obj(self):
        """A parenthesised group or a single atom."""
        if self.static_b("("):
            return self.group()
        return self.repetition(self.atom())

    def group(self):
        """The inside of `(...)`; the opening parenthesis has already been consumed."""
        if self.static_b("?"):
            return self.extension_group()
        else:
            p = self.pattern()
            self.static(")")
            return self.repetition(p)

    def _unknown_extension(self, c: str) -> InvalidSyntax:
        """Build the error for an unrecognised `(?...)` extension introduced by `c`."""
        context = self.data[self.index - 3:self.index + 5]
        return InvalidSyntax(f"Unknown group-extension: {c!r} (Context: {context!r})")

    def extension_group(self):
        """The inside of `(?...)`; the leading `(?` has already been consumed.

        Handles inline flags, non-capturing and named groups, comments and
        lookaheads/lookbacks.

        :raises Unsupported: for group references and conditional matching.
        :raises InvalidSyntax: for any unknown extension.
        """
        c = self.any()
        if c in 'aiLmsux-':
            # Put the character back so the flag letters can be read as a run.
            self.index -= 1
            added_flags = self.multiple('aiLmsux', 0, None)
            if self.static_b('-'):
                removed_flags = self.multiple('aiLmsux', 1, None)
            else:
                removed_flags = ''
            if self.static_b(':'):
                # Scoped flags: `(?i-s:...)`
                p = self.pattern()
                p = p.with_flags(_get_flags(added_flags), _get_flags(removed_flags))
                self.static(")")
                return self.repetition(p)
            elif removed_flags != '':
                # Flags may only be removed in the scoped form.
                raise nomatch
            else:
                # Global flags: `(?i)`; remembered and applied in `start`.
                self.static(')')
                self.flags = _get_flags(added_flags)
                return _EMPTY
        elif c == ':':
            p = self.pattern()
            self.static(")")
            return self.repetition(p)
        elif c == 'P':
            if self.static_b('<'):
                # Named group: the name is read and discarded.
                self.multiple('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_', 1, None)
                self.static('>')
                p = self.pattern()
                self.static(")")
                return self.repetition(p)
            elif self.static_b('='):
                raise Unsupported("Group references are not implemented")
            # Neither `(?P<` nor `(?P=`: previously fell through and returned None.
            raise self._unknown_extension(c)
        elif c == '#':
            # Comment group: skip everything up to and including the closing ')'.
            while not self.static_b(')'):
                self.any()
            # A comment contributes nothing to the match (previously returned None).
            return _EMPTY
        elif c == '=':
            p = self.pattern()
            self.static(")")
            return _NonCapturing(p, False, False)
        elif c == '!':
            p = self.pattern()
            self.static(")")
            return _NonCapturing(p, False, True)
        elif c == '<':
            c = self.any()
            if c == '=':
                p = self.pattern()
                self.static(")")
                return _NonCapturing(p, True, False)
            elif c == '!':
                p = self.pattern()
                self.static(")")
                return _NonCapturing(p, True, True)
            # `(?<` followed by anything else: previously returned None.
            raise self._unknown_extension(c)
        elif c == '(':
            raise Unsupported("Conditional matching is not implemented")
        else:
            raise self._unknown_extension(c)

    def atom(self):
        """A char group, an escape, a dot or a literal character, with optional repetition."""
        if self.static_b("["):
            return self.repetition(self.chargroup())
        elif self.static_b("\\"):
            return self.repetition(self.escaped())
        elif self.static_b("."):
            return self.repetition(_DOT)
        elif self.static_b("$"):
            raise Unsupported("'$'")
        elif self.static_b("^"):
            raise Unsupported("'^'")
        else:
            c = self.any_but(*self.SPECIAL_CHARS_STANDARD)
            return self.repetition(_CharGroup(frozenset({c}), False))

    def repetition(self, base: _Repeatable):
        """Wrap `base` in a `_Repeated` if a quantifier follows, else return it unchanged.

        A trailing `?` after a quantifier is consumed and ignored.

        :raises InvalidSyntax: if `{n,m}` is given with `m < n`.
        """
        if self.static_b("*"):
            self.static_b("?")
            return _Repeated(base, 0, None)
        elif self.static_b("+"):
            self.static_b("?")
            return _Repeated(base, 1, None)
        elif self.static_b("?"):
            self.static_b("?")
            return _Repeated(base, 0, 1)
        elif self.static_b("{"):
            try:
                n = self.number()
            except nomatch:
                n = 0
            if self.static_b(','):
                try:
                    m = self.number()
                except nomatch:
                    m = None
            else:
                m = n
            self.static("}")
            self.static_b('?')
            if m is not None and m < n:
                # Would otherwise produce a negative count for the optional part.
                raise InvalidSyntax(f"min repeat greater than max repeat: {{{n},{m}}}")
            return _Repeated(base, n, m)
        else:
            return base

    def number(self) -> int:
        """Read one or more decimal digits as an int."""
        return int(self.multiple("0123456789", 1, None))

    def escaped(self, inner=False):
        """Parse what follows a backslash; `inner` is true inside of `[...]`.

        :raises Unsupported: for unicode/property escapes, group references and
            (outside of `[...]`) the reserved escapes.
        """
        if self.static_b("x"):
            n = self.multiple("0123456789abcdefABCDEF", 2, 2)
            c = chr(int(n, 16))
            return _CharGroup(frozenset({c}), False)
        if self.static_b("0"):
            # `\0` may be followed by up to two more octal digits; a bare `\0`
            # is the NUL character.
            n = self.multiple("01234567", 0, 2)
            c = chr(int(n, 8)) if n else "\0"
            return _CharGroup(frozenset({c}), False)
        if self.anyof_b('N', 'p', 'P', 'u', 'U'):
            raise Unsupported('regex module unicode properties are not supported.')
        if not inner:
            # Exactly three octal digits form an octal escape ...
            try:
                n = self.multiple("01234567", 3, 3)
            except nomatch:
                pass
            else:
                c = chr(int(n, 8))
                return _CharGroup(frozenset({c}), False)
            # ... any other leading digits are a group reference.
            try:
                self.multiple("0123456789", 1, 2)
            except nomatch:
                pass
            else:
                raise Unsupported("Group references are not implemented")
        else:
            # Inside a char group, one to three octal digits form an octal escape.
            try:
                n = self.multiple("01234567", 1, 3)
            except nomatch:
                pass
            else:
                c = chr(int(n, 8))
                return _CharGroup(frozenset({c}), False)
        if not inner:
            try:
                c = self.anyof(*self.RESERVED_ESCAPES)
            except nomatch:
                pass
            else:
                raise Unsupported(f"Escape \\{c} is not implemented")
        try:
            c = self.anyof(*_CHAR_GROUPS)
        except nomatch:
            pass
        else:
            return _CHAR_GROUPS[c]
        c = self.any_but("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
        if c.isalpha():
            # Unknown letter escapes are not treated as literals.
            raise nomatch
        return _CharGroup(frozenset(c), False)

    def chargroup(self):
        """The inside of `[...]`; the opening bracket has already been consumed."""
        negate = bool(self.static_b("^"))
        groups = []
        while True:
            try:
                groups.append(self.chargroup_inner())
            except nomatch:
                break
        self.static("]")
        if len(groups) == 1:
            f = groups[0]
            return _CharGroup(f.chars, negate ^ f.negated)
        elif len(groups) == 0:
            return _CharGroup(frozenset({}), negate)
        else:
            return _combine_char_groups(*groups, negate=negate)

    def chargroup_inner(self) -> _CharGroup:
        """One element of a char group: a single char, an escape or a range `a-b`.

        :raises InvalidSyntax: if a range end is not a single char or the range is reversed.
        """
        start = self.index
        if self.static_b('\\'):
            base = self.escaped(True)
        else:
            base = _CharGroup(frozenset(self.any_but(*self.SPECIAL_CHARS_INNER)), False)
        if self.static_b('-'):
            if self.static_b('\\'):
                end = self.escaped(True)
            elif self.peek_static(']'):
                # A '-' directly before the closing bracket is a literal dash.
                return _combine_char_groups(base, _CharGroup(frozenset('-'), False), negate=False)
            else:
                end = _CharGroup(frozenset(self.any_but(*self.SPECIAL_CHARS_INNER)), False)
            if len(base.chars) != 1 or len(end.chars) != 1:
                raise InvalidSyntax(f"Invalid Character-range: {self.data[start:self.index]}")
            low, high = ord(*base.chars), ord(*end.chars)
            if low > high:
                raise InvalidSyntax(f"Invalid Character-range: {self.data[start:self.index]}")
            return _CharGroup(frozenset((chr(i) for i in range(low, high + 1))), False)
        return base
726
+
727
+
728
def parse_pattern(pattern: str) -> Pattern:
    """Parse the regex string `pattern` and return the simplified `Pattern` tree."""
    return _ParsePattern(pattern).parse().simplify()