jaclang 0.8.4__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of jaclang might be problematic. Click here for more details.
- jaclang/cli/cli.py +74 -22
- jaclang/compiler/jac.lark +3 -3
- jaclang/compiler/larkparse/jac_parser.py +2 -2
- jaclang/compiler/parser.py +14 -21
- jaclang/compiler/passes/main/__init__.py +3 -1
- jaclang/compiler/passes/main/binder_pass.py +594 -0
- jaclang/compiler/passes/main/import_pass.py +8 -256
- jaclang/compiler/passes/main/inheritance_pass.py +2 -2
- jaclang/compiler/passes/main/pyast_gen_pass.py +35 -69
- jaclang/compiler/passes/main/pyast_load_pass.py +24 -13
- jaclang/compiler/passes/main/sem_def_match_pass.py +1 -1
- jaclang/compiler/passes/main/tests/fixtures/M1.jac +3 -0
- jaclang/compiler/passes/main/tests/fixtures/sym_binder.jac +47 -0
- jaclang/compiler/passes/main/tests/test_binder_pass.py +111 -0
- jaclang/compiler/passes/main/tests/test_pyast_gen_pass.py +13 -13
- jaclang/compiler/passes/main/tests/test_sem_def_match_pass.py +6 -6
- jaclang/compiler/passes/tool/doc_ir_gen_pass.py +2 -0
- jaclang/compiler/passes/tool/tests/fixtures/simple_walk_fmt.jac +6 -0
- jaclang/compiler/program.py +15 -8
- jaclang/compiler/tests/test_sr_errors.py +32 -0
- jaclang/compiler/unitree.py +21 -15
- jaclang/langserve/engine.jac +23 -4
- jaclang/langserve/tests/test_server.py +13 -0
- jaclang/runtimelib/importer.py +33 -62
- jaclang/runtimelib/utils.py +29 -0
- jaclang/tests/fixtures/pyfunc_fmt.py +60 -0
- jaclang/tests/fixtures/pyfunc_fstr.py +25 -0
- jaclang/tests/fixtures/pyfunc_kwesc.py +33 -0
- jaclang/tests/fixtures/python_run_test.py +19 -0
- jaclang/tests/test_cli.py +67 -0
- jaclang/tests/test_language.py +96 -1
- jaclang/utils/lang_tools.py +3 -3
- jaclang/utils/module_resolver.py +90 -0
- jaclang/utils/symtable_test_helpers.py +125 -0
- jaclang/utils/test.py +3 -4
- jaclang/vendor/interegular/__init__.py +34 -0
- jaclang/vendor/interegular/comparator.py +163 -0
- jaclang/vendor/interegular/fsm.py +1015 -0
- jaclang/vendor/interegular/patterns.py +732 -0
- jaclang/vendor/interegular/py.typed +0 -0
- jaclang/vendor/interegular/utils/__init__.py +15 -0
- jaclang/vendor/interegular/utils/simple_parser.py +165 -0
- jaclang/vendor/interegular-0.3.3.dist-info/INSTALLER +1 -0
- jaclang/vendor/interegular-0.3.3.dist-info/LICENSE.txt +21 -0
- jaclang/vendor/interegular-0.3.3.dist-info/METADATA +64 -0
- jaclang/vendor/interegular-0.3.3.dist-info/RECORD +20 -0
- jaclang/vendor/interegular-0.3.3.dist-info/REQUESTED +0 -0
- jaclang/vendor/interegular-0.3.3.dist-info/WHEEL +5 -0
- jaclang/vendor/interegular-0.3.3.dist-info/top_level.txt +1 -0
- {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/METADATA +1 -1
- {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/RECORD +53 -29
- {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/WHEEL +0 -0
- {jaclang-0.8.4.dist-info → jaclang-0.8.5.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,732 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Allows the parsing of python-style regexes to FSMs.
|
|
3
|
+
Main access point is `parse_pattern(str) -> Pattern`.
|
|
4
|
+
Most other classes are internal and should not be used.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from abc import abstractmethod, ABC
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from enum import Flag, auto
|
|
10
|
+
from textwrap import indent
|
|
11
|
+
from typing import Iterable, FrozenSet, Optional, Tuple, Union
|
|
12
|
+
|
|
13
|
+
from interegular.fsm import FSM, anything_else, epsilon, Alphabet
|
|
14
|
+
from interegular.utils.simple_parser import SimpleParser, nomatch, NoMatch
|
|
15
|
+
|
|
16
|
+
# Public API of this module; every other name is internal.
__all__ = [
    'parse_pattern',
    'Pattern',
    'Unsupported',
    'InvalidSyntax',
    'REFlags',
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Unsupported(Exception):
    """Raised for regex features this module recognises but does not implement."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class InvalidSyntax(Exception):
    """Raised when a pattern string cannot be parsed."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class REFlags(Flag):
    """Regex flags understood by this module; each has a one-letter alias."""

    CASE_INSENSITIVE = auto()
    I = CASE_INSENSITIVE

    MULTILINE = auto()
    M = MULTILINE

    SINGLE_LINE = auto()
    S = SINGLE_LINE
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Maps the inline flag letters accepted in `(?...)` groups to their REFlags member.
_flags = {
    'i': REFlags.CASE_INSENSITIVE,
    'm': REFlags.MULTILINE,
    's': REFlags.SINGLE_LINE,
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _get_flags(plus: str) -> REFlags:
    """Translate a string of flag letters into the combined `REFlags` value.

    Raises `Unsupported` for any letter that has no entry in `_flags`.
    """
    combined = REFlags(0)
    for letter in plus:
        try:
            flag = _flags[letter]
        except KeyError:
            raise Unsupported(f"Flag {letter} is not implemented")
        combined = combined | flag
    return combined
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _combine_flags(base: REFlags, added: REFlags, removed: REFlags):
    """Return `base` with `added` switched on and then `removed` switched off."""
    # TODO: Check for incorrect combinations (aLu)
    return (base | added) & ~removed
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True)
class _BasePattern(ABC):
    """Abstract base of all pattern nodes.

    Subclasses implement the `_get_*` hooks; the public accessors defined
    here memoise their results on the instance.
    """
    __slots__ = '_alphabet_cache', '_prefix_cache', '_lengths_cache'

    @abstractmethod
    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=None) -> FSM:
        """Build the FSM for this pattern."""
        raise NotImplementedError

    @abstractmethod
    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        """Compute (uncached) the alphabet of this pattern under `flags`."""
        raise NotImplementedError

    def get_alphabet(self, flags: REFlags) -> Alphabet:
        """Return the alphabet of this pattern under `flags`, cached per flag value."""
        try:
            cache = self._alphabet_cache
        except AttributeError:
            cache = {}
            # The dataclass is frozen, so the cache slot is written through
            # object.__setattr__ to bypass the frozen check.
            object.__setattr__(self, '_alphabet_cache', cache)
        if flags not in cache:
            cache[flags] = self._get_alphabet(flags)
        return cache[flags]

    @abstractmethod
    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        """Compute (uncached) the value exposed by `prefix_postfix`."""
        raise NotImplementedError

    @property
    def prefix_postfix(self) -> Tuple[int, Optional[int]]:
        """Returns the number of dots that have to be pre-/postfixed to support look(aheads|backs)"""
        try:
            return self._prefix_cache
        except AttributeError:
            pass
        object.__setattr__(self, '_prefix_cache', self._get_prefix_postfix())
        return self._prefix_cache

    @abstractmethod
    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        """Compute (uncached) the value exposed by `lengths`."""
        raise NotImplementedError

    @property
    def lengths(self) -> Tuple[int, Optional[int]]:
        """Returns the minimum and maximum length that this pattern can match
        (maximum can be None for infinite length)"""
        try:
            return self._lengths_cache
        except AttributeError:
            pass
        object.__setattr__(self, '_lengths_cache', self._get_lengths())
        return self._lengths_cache

    @abstractmethod
    def simplify(self) -> '_BasePattern':
        """Return a simplified pattern."""
        raise NotImplementedError
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class _Repeatable(_BasePattern, ABC):
    """Marker base class for patterns that may be used as the base of a `_Repeated`."""
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass(frozen=True)
class _CharGroup(_Repeatable):
    """Represents the smallest possible pattern that can be matched: A single char.
    Direct port from the lego module"""
    chars: FrozenSet[str]
    negated: bool
    __slots__ = 'chars', 'negated'

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        """Alphabet made of this group's chars (both cases when case-insensitive) plus `anything_else`."""
        if not flags & REFlags.CASE_INSENSITIVE:
            return Alphabet.from_groups(self.chars, {anything_else})
        relevant = set()
        relevant.update(map(str.lower, self.chars))
        relevant.update(map(str.upper, self.chars))
        return Alphabet.from_groups(relevant, {anything_else})

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return 0, 0

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        # A char group always matches exactly one symbol.
        return 1, 1

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build a two-state FSM accepting exactly one symbol of this group.

        Raises `ValueError` if a non-zero prefix/postfix is requested and
        `Unsupported` if any flag other than CASE_INSENSITIVE or SINGLE_LINE
        is set.
        """
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        if prefix_postfix != (0, 0):
            raise ValueError("Can not have prefix/postfix on CharGroup-level")

        insensitive = flags & REFlags.CASE_INSENSITIVE
        # Only CASE_INSENSITIVE and SINGLE_LINE are tolerated here; anything left is unsupported.
        leftover = flags & ~REFlags.CASE_INSENSITIVE & ~REFlags.SINGLE_LINE
        if leftover:
            raise Unsupported(leftover)

        if insensitive:
            both_cases = set()
            both_cases.update(c.lower() for c in self.chars)
            both_cases.update(c.upper() for c in self.chars)
            chars = frozenset(both_cases)
        else:
            chars = self.chars

        # State 0 is initial, state 1 is final.
        if self.negated:
            # Negated: accept every alphabet symbol that is not in the group.
            accepted = set(alphabet) - chars
        else:
            # Normal: accept only the group's own characters.
            accepted = chars
        transitions = {alphabet[symbol]: 1 for symbol in accepted}

        return FSM(
            alphabet=alphabet,
            states={0, 1},
            initial=0,
            finals={1},
            map={0: transitions},
        )

    def simplify(self) -> '_CharGroup':
        return self
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _combine_char_groups(*groups: _CharGroup, negate):
    """Return the single `_CharGroup` equivalent to the union of `groups`.

    A character belongs to the union if it is in any non-negated group or
    is missing from any negated group. Equivalently, it is excluded only if
    it is listed by *every* negated group and by no non-negated group.
    The negated groups' char sets are therefore intersected, not unioned.

    :param groups: the char groups to merge.
    :param negate: if true, the merged group is inverted (as for `[^...]`).
    """
    pos = set().union(*(g.chars for g in groups if not g.negated))
    negated_sets = [g.chars for g in groups if g.negated]
    if negated_sets:
        # Complement of the union = intersection of the negated sets minus the positives.
        excluded = set(negated_sets[0]).intersection(*negated_sets[1:])
        return _CharGroup(frozenset(excluded - pos), not negate)
    return _CharGroup(frozenset(pos), negate)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@dataclass(frozen=True)
class __DotCls(_Repeatable):
    """Pattern node for `.`: one symbol, excluding newline unless SINGLE_LINE is set."""

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build a two-state FSM accepting one symbol matched by the dot."""
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if flags is not None and flags & REFlags.SINGLE_LINE:
            symbols = alphabet
        else:
            # Without SINGLE_LINE the dot must not match a newline.
            symbols = set(alphabet) - {'\n'}
        transitions = {alphabet[sym]: 1 for sym in symbols}
        return FSM(
            alphabet=alphabet,
            states={0, 1},
            initial=0,
            finals={1},
            map={0: transitions},
        )

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        if flags & REFlags.SINGLE_LINE:
            return Alphabet.from_groups({anything_else})
        # Newline needs its own group so it can be excluded in to_fsm.
        return Alphabet.from_groups({anything_else}, {'\n'})

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return 0, 0

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        return 1, 1

    def simplify(self) -> '__DotCls':
        return self
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
@dataclass(frozen=True)
class __EmptyCls(_BasePattern):
    """Pattern node that matches only the empty string."""

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Return the epsilon FSM over `alphabet` (or over this node's own alphabet)."""
        chosen = self.get_alphabet(flags) if alphabet is None else alphabet
        return epsilon(chosen)

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        return Alphabet.from_groups({anything_else})

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return 0, 0

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        # Matches nothing but the empty string.
        return 0, 0

    def simplify(self) -> '__EmptyCls':
        return self
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# Shared singleton nodes.
_DOT = __DotCls()
_EMPTY = __EmptyCls()
# Empty char set: non-negated matches no character, negated matches every character.
_NONE = _CharGroup(frozenset(), False)
_ALL = _CharGroup(frozenset(), True)

# Char groups selected by a backslash escape, keyed by the escape letter.
_CHAR_GROUPS = {
    # Character classes; the upper-case letter is the negation of the lower-case one.
    'w': _CharGroup(
        frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"),
        False,
    ),
    'W': _CharGroup(
        frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"),
        True,
    ),
    'd': _CharGroup(frozenset("0123456789"), False),
    'D': _CharGroup(frozenset("0123456789"), True),
    's': _CharGroup(frozenset(" \t\n\r\f\v"), False),
    'S': _CharGroup(frozenset(" \t\n\r\f\v"), True),

    # Single control characters.
    'a': _CharGroup(frozenset({"\a"}), False),
    'b': _CharGroup(frozenset({"\b"}), False),
    'f': _CharGroup(frozenset({"\f"}), False),
    'n': _CharGroup(frozenset({"\n"}), False),
    'r': _CharGroup(frozenset({"\r"}), False),
    't': _CharGroup(frozenset({"\t"}), False),
    'v': _CharGroup(frozenset({"\v"}), False),
}
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@dataclass(frozen=True)
class _Repeated(_BasePattern):
    """Represents a repeated pattern. `base` can be matched from `min` to `max` times.
    `max` may be None to signal infinite"""
    base: _Repeatable
    min: int
    max: Optional[int]

    def __str__(self):
        upper = '' if self.max is None else self.max
        header = f"Repeated[{self.min}:{upper}]:\n"
        return header + f"{indent(str(self.base), ' ')}"

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        return self.base.get_alphabet(flags)

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        return self.base.prefix_postfix

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        base_low, base_high = self.base.lengths
        if base_high is None or self.max is None:
            # Unbounded base or unbounded repetition: no maximum length.
            longest = None
        else:
            longest = base_high * self.max
        return base_low * self.min, longest

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build the FSM as `min` mandatory copies of the base followed by the optional tail.

        Raises `ValueError` if a non-zero prefix/postfix is requested.
        """
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        if prefix_postfix != (0, 0):
            raise ValueError("Can not have prefix/postfix on CharGroup-level")

        unit = self.base.to_fsm(alphabet, (0, 0), flags=flags)
        mandatory = unit * self.min
        if self.max is None:
            optional = unit.star()
        else:
            # Make a copy of the unit that also accepts in its initial state,
            # then repeat that copy for the remaining (max - min) slots.
            optional = unit.copy()
            optional.__dict__['finals'] |= {optional.initial}
            optional *= (self.max - self.min)
        return mandatory + optional

    def simplify(self) -> '_Repeated':
        return self.__class__(self.base.simplify(), self.min, self.max)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# Zero or more repetitions of the match-everything char group.
_ALL_STAR = _Repeated(base=_ALL, min=0, max=None)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
@dataclass(frozen=True)
class _NonCapturing:
    """Represents a lookahead/lookback. Matches `inner` without 'consuming' anything. Can be negated.
    Only valid inside a `_Concatenation`"""
    inner: _BasePattern
    backwards: bool
    negate: bool
    __slots__ = 'inner', 'backwards', 'negate'

    def get_alphabet(self, flags: REFlags) -> Alphabet:
        """Return the alphabet of the wrapped pattern."""
        return self.inner.get_alphabet(flags)

    def simplify(self) -> '_NonCapturing':
        """Return a copy whose inner pattern has been simplified."""
        simplified_inner = self.inner.simplify()
        return self.__class__(simplified_inner, self.backwards, self.negate)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
@dataclass(frozen=True)
class _Concatenation(_BasePattern):
    """Represents multiple Patterns that have to be match in a row. Can contain `_NonCapturing`"""
    parts: Tuple[Union[_BasePattern, _NonCapturing], ...]
    __slots__ = 'parts',

    def __str__(self):
        body = "\n".join(indent(str(p), ' ') for p in self.parts)
        return "Concatenation:\n" + body

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        part_alphabets = (p.get_alphabet(flags) for p in self.parts)
        return Alphabet.union(*part_alphabets)[0]

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        # Forward pass: how far could a lookback stick out over the beginning?
        pre = 0
        consumed = 0  # minimum number of chars consumed so far
        for part in self.parts:
            if not isinstance(part, _NonCapturing):
                consumed += part.lengths[0]
                continue
            if not part.backwards:
                continue
            a, b = part.inner.lengths
            if a != b:
                raise InvalidSyntax(f"lookbacks have to have fixed length {(a, b)}")
            pre = max(pre, a - consumed)

        # Backward pass: how far could a lookahead stick out over the end?
        post = 0
        consumed = 0
        for part in reversed(self.parts):
            if not isinstance(part, _NonCapturing):
                consumed += part.lengths[0]
                continue
            if part.backwards:
                continue
            a, b = part.inner.lengths
            # TODO: is using the minimum correct for unbounded lookaheads?
            required = (a if b is None else b) - consumed
            post = max(post, required)
        return pre, post

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        low, high = 0, 0
        for part in self.parts:
            if isinstance(part, _NonCapturing):
                # Lookarounds consume nothing.
                continue
            part_low, part_high = part.lengths
            low += part_low
            if high is None or part_high is None:
                high = None
            else:
                high += part_high
        return low, high

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build the FSM for the concatenation, applying lookaheads from right to left.

        Raises `Unsupported` for lookbacks and when the requested
        prefix/postfix is smaller than what this concatenation needs.
        """
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        own_pre, own_post = self.prefix_postfix
        if prefix_postfix[0] < own_pre or prefix_postfix[1] < own_post:
            raise Unsupported("Group can not have lookbacks/lookaheads that go beyond the group bounds.")

        all_ = _ALL.to_fsm(alphabet)
        all_star = all_.star()

        # `segments` alternates (None, [fsm, ...]) runs of consuming parts with
        # (lookahead, fsm) entries; `run` collects the consuming parts seen
        # since the last lookahead.
        segments = []
        run = [all_.times(prefix_postfix[0])]
        for part in self.parts:
            if not isinstance(part, _NonCapturing):
                run.append(part.to_fsm(alphabet, (0, 0), flags))
                continue
            inner = part.inner.to_fsm(alphabet, (0, 0), flags)
            if part.backwards:
                raise Unsupported("lookbacks are not implemented")
            segments.append((None, run))
            segments.append((part, inner))
            run = []
        run.append(all_.times(prefix_postfix[1]))

        result = FSM.concatenate(*run)
        for marker, fsm in reversed(segments):
            if marker is None:
                result = FSM.concatenate(*fsm, result)
                continue
            assert isinstance(marker, _NonCapturing) and not marker.backwards
            lookahead = fsm + all_star
            if marker.negate:
                result = result.difference(lookahead)  # TODO: This does not feel right...
            else:
                result = result.intersection(lookahead)
        return result

    def simplify(self) -> '_Concatenation':
        return self.__class__(tuple(p.simplify() for p in self.parts))
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
@dataclass(frozen=True)
class Pattern(_Repeatable):
    """An alternation of `options`, with flags added/removed for everything inside it."""
    options: Tuple[_BasePattern, ...]
    added_flags: REFlags = REFlags(0)
    removed_flags: REFlags = REFlags(0)

    def __str__(self):
        body = "\n".join(indent(str(o), ' ') for o in self.options)
        return "Pattern:\n" + body

    def _get_alphabet(self, flags: REFlags) -> Alphabet:
        effective = _combine_flags(flags, self.added_flags, self.removed_flags)
        option_alphabets = (p.get_alphabet(effective) for p in self.options)
        return Alphabet.union(*option_alphabets)[0]

    def _get_lengths(self) -> Tuple[int, Optional[int]]:
        shortest, longest = None, 0
        for option in self.options:
            opt_low, opt_high = option.lengths
            if shortest is None or opt_low < shortest:
                shortest = opt_low
            # Once any option is unbounded, `longest` stays None.
            if opt_high is None:
                longest = None
            elif longest is not None and opt_high > longest:
                longest = opt_high
        return shortest, longest

    def _get_prefix_postfix(self) -> Tuple[int, Optional[int]]:
        pre, post = 0, 0
        for option in self.options:
            opt_pre, opt_post = option.prefix_postfix
            if opt_pre > pre:
                pre = opt_pre
            if opt_post is None:
                post = None
            elif post is not None and opt_post > post:
                post = opt_post
        return pre, post

    def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
        """Build the union of the FSMs of all options under this pattern's effective flags."""
        flags = _combine_flags(flags, self.added_flags, self.removed_flags)
        if alphabet is None:
            alphabet = self.get_alphabet(flags)
        if prefix_postfix is None:
            prefix_postfix = self.prefix_postfix
        option_fsms = (o.to_fsm(alphabet, prefix_postfix, flags) for o in self.options)
        return FSM.union(*option_fsms)

    def with_flags(self, added: REFlags, removed: REFlags = REFlags(0)) -> 'Pattern':
        """Return a pattern with the same options but the given added/removed flags."""
        return self.__class__(self.options, added, removed)

    def simplify(self) -> 'Pattern':
        """Simplify all options, unwrapping a pattern that merely wraps one nested `Pattern`."""
        if len(self.options) == 1:
            only = self.options[0]
            wraps_single_pattern = (
                isinstance(only, _Concatenation)
                and len(only.parts) == 1
                and isinstance(only.parts[0], Pattern)
            )
            if wraps_single_pattern:
                nested: Pattern = only.parts[0].simplify()
                own = _combine_flags(REFlags(0), self.added_flags, self.removed_flags)
                merged = _combine_flags(own, nested.added_flags, nested.removed_flags)
                return nested.with_flags(merged)
        simplified = tuple(o.simplify() for o in self.options)
        return self.__class__(simplified, self.added_flags, self.removed_flags)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
class _ParsePattern(SimpleParser[Pattern]):
    """Recursive-descent parser turning a python-style regex string into a `Pattern`.

    Parse failures surface as `InvalidSyntax`; recognised but unimplemented
    regex features raise `Unsupported`.
    """
    SPECIAL_CHARS_STANDARD: FrozenSet[str] = frozenset({
        '+', '?', '*', '.', '$', '^', '\\', '(', ')', '[', '|'
    })
    SPECIAL_CHARS_INNER: FrozenSet[str] = frozenset({
        '\\', ']'
    })
    RESERVED_ESCAPES: FrozenSet[str] = frozenset({
        'u', 'U', 'A', 'Z', 'b', 'B'
    })

    def __init__(self, data: str):
        super(_ParsePattern, self).__init__(data)
        # Flags set by a bare `(?flags)` group; applied to the whole pattern in start().
        self.flags = None

    def parse(self):
        """Parse the input, converting the parser's `NoMatch` into `InvalidSyntax`."""
        try:
            return super(_ParsePattern, self).parse()
        except NoMatch:
            raise InvalidSyntax

    def start(self):
        self.flags = None
        p = self.pattern()
        if self.flags is not None:
            p = p.with_flags(self.flags)
        return p

    def pattern(self):
        """Parse `conc ('|' conc)*` into a `Pattern`."""
        options = [self.conc()]
        while self.static_b('|'):
            options.append(self.conc())
        return Pattern(tuple(options))

    def conc(self):
        """Parse as many consecutive objects as possible into a `_Concatenation`."""
        parts = []
        while True:
            try:
                parts.append(self.obj())
            except nomatch:
                break
        return _Concatenation(tuple(parts))

    def obj(self):
        if self.static_b("("):
            return self.group()
        return self.repetition(self.atom())

    def group(self):
        """Parse the remainder of a group whose opening `(` was already consumed."""
        if self.static_b("?"):
            return self.extension_group()
        else:
            p = self.pattern()
            self.static(")")
            return self.repetition(p)

    def extension_group(self):
        """Parse a `(?...)` extension group whose `(?` was already consumed.

        Every branch returns a pattern node or raises; it never returns `None`,
        because the result is stored directly in a `_Concatenation`.
        """
        c = self.any()
        if c in 'aiLmsux-':
            # Un-read the flag letter so the whole flag run is parsed in one go.
            self.index -= 1
            added_flags = self.multiple('aiLmsux', 0, None)
            if self.static_b('-'):
                removed_flags = self.multiple('aiLmsux', 1, None)
            else:
                removed_flags = ''
            if self.static_b(':'):
                # Scoped flags: (?flags:pattern)
                p = self.pattern()
                p = p.with_flags(_get_flags(added_flags), _get_flags(removed_flags))
                self.static(")")
                return self.repetition(p)
            elif removed_flags != '':
                raise nomatch
            else:
                # Bare (?flags): recorded here and applied to the whole pattern in start().
                self.static(')')
                self.flags = _get_flags(added_flags)
                return _EMPTY
        elif c == ':':
            p = self.pattern()
            self.static(")")
            return self.repetition(p)
        elif c == 'P':
            if self.static_b('<'):
                # Named group: the name is parsed and discarded.
                self.multiple('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_', 1, None)
                self.static('>')
                p = self.pattern()
                self.static(")")
                return self.repetition(p)
            elif self.static_b('='):
                raise Unsupported("Group references are not implemented")
            # `(?P` followed by anything else is not a valid extension.
            raise InvalidSyntax(
                f"Unknown group-extension: 'P' (Context: {self.data[self.index - 3:self.index + 5]!r})")
        elif c == '#':
            # Comment group: skip to the closing parenthesis; it matches the empty string.
            while not self.static_b(')'):
                self.any()
            return _EMPTY
        elif c == '=':
            p = self.pattern()
            self.static(")")
            return _NonCapturing(p, False, False)
        elif c == '!':
            p = self.pattern()
            self.static(")")
            return _NonCapturing(p, False, True)
        elif c == '<':
            c = self.any()
            if c == '=':
                p = self.pattern()
                self.static(")")
                return _NonCapturing(p, True, False)
            elif c == '!':
                p = self.pattern()
                self.static(")")
                return _NonCapturing(p, True, True)
            # `(?<` must be followed by `=` or `!`.
            raise InvalidSyntax(
                f"Unknown group-extension: {'<' + c!r} (Context: {self.data[self.index - 3:self.index + 5]!r})")
        elif c == '(':
            raise Unsupported("Conditional matching is not implemented")
        else:
            raise InvalidSyntax(
                f"Unknown group-extension: {c!r} (Context: {self.data[self.index - 3:self.index + 5]!r}")

    def atom(self):
        if self.static_b("["):
            return self.repetition(self.chargroup())
        elif self.static_b("\\"):
            return self.repetition(self.escaped())
        elif self.static_b("."):
            return self.repetition(_DOT)
        elif self.static_b("$"):
            raise Unsupported("'$'")
        elif self.static_b("^"):
            raise Unsupported("'^'")
        else:
            c = self.any_but(*self.SPECIAL_CHARS_STANDARD)
            return self.repetition(_CharGroup(frozenset({c}), False))

    def repetition(self, base: _Repeatable):
        """Wrap `base` in a `_Repeated` if a quantifier follows, else return it unchanged.

        A trailing `?` (lazy modifier) after a quantifier is consumed and ignored.
        Raises `InvalidSyntax` for `{n,m}` with m < n.
        """
        if self.static_b("*"):
            self.static_b("?")
            return _Repeated(base, 0, None)
        elif self.static_b("+"):
            self.static_b("?")
            return _Repeated(base, 1, None)
        elif self.static_b("?"):
            self.static_b("?")
            return _Repeated(base, 0, 1)
        elif self.static_b("{"):
            try:
                n = self.number()
            except nomatch:
                n = 0
            if self.static_b(','):
                try:
                    m = self.number()
                except nomatch:
                    m = None
            else:
                m = n
            self.static("}")
            self.static_b('?')
            if m is not None and m < n:
                raise InvalidSyntax(f"Invalid repetition range: min {n} is greater than max {m}")
            return _Repeated(base, n, m)
        else:
            return base

    def number(self) -> int:
        return int(self.multiple("0123456789", 1, None))

    def escaped(self, inner=False):
        """Parse an escape sequence whose backslash was already consumed.

        :param inner: true when parsing inside a `[...]` character group.
        """
        if self.static_b("x"):
            n = self.multiple("0123456789abcdefABCDEF", 2, 2)
            c = chr(int(n, 16))
            return _CharGroup(frozenset({c}), False)
        if self.static_b("0"):
            # `\0` may be followed by up to two more octal digits; alone it is NUL.
            n = self.multiple("01234567", 0, 2)
            c = chr(int("0" + n, 8))
            return _CharGroup(frozenset({c}), False)
        if self.anyof_b('N', 'p', 'P', 'u', 'U'):
            raise Unsupported('regex module unicode properties are not supported.')
        if not inner:
            # Outside a char group, three octal digits are a character;
            # one or two decimal digits are a group reference.
            try:
                n = self.multiple("01234567", 3, 3)
            except nomatch:
                pass
            else:
                c = chr(int(n, 8))
                return _CharGroup(frozenset({c}), False)
            try:
                self.multiple("0123456789", 1, 2)
            except nomatch:
                pass
            else:
                raise Unsupported("Group references are not implemented")
        else:
            # Inside a char group, one to three octal digits are read as an octal character.
            try:
                n = self.multiple("01234567", 1, 3)
            except nomatch:
                pass
            else:
                c = chr(int(n, 8))
                return _CharGroup(frozenset({c}), False)
        if not inner:
            try:
                c = self.anyof(*self.RESERVED_ESCAPES)
            except nomatch:
                pass
            else:
                raise Unsupported(f"Escape \\{c} is not implemented")
        try:
            c = self.anyof(*_CHAR_GROUPS)
        except nomatch:
            pass
        else:
            return _CHAR_GROUPS[c]
        c = self.any_but("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
        if c.isalpha():
            # Unknown alphabetic escapes are rejected rather than taken literally.
            raise nomatch
        return _CharGroup(frozenset(c), False)

    def chargroup(self):
        """Parse the body of a `[...]` group whose `[` was already consumed."""
        negate = bool(self.static_b("^"))
        groups = []
        while True:
            try:
                groups.append(self.chargroup_inner())
            except nomatch:
                break
        self.static("]")
        if len(groups) == 1:
            f = groups[0]
            return _CharGroup(f.chars, negate ^ f.negated)
        elif len(groups) == 0:
            return _CharGroup(frozenset({}), negate)
        else:
            return _combine_char_groups(*groups, negate=negate)

    def chargroup_inner(self) -> _CharGroup:
        """Parse one item of a `[...]` group: a char, an escape, or a `lo-hi` range."""
        start = self.index
        if self.static_b('\\'):
            base = self.escaped(True)
        else:
            base = _CharGroup(frozenset(self.any_but(*self.SPECIAL_CHARS_INNER)), False)
        if self.static_b('-'):
            if self.static_b('\\'):
                end = self.escaped(True)
            elif self.peek_static(']'):
                # A '-' directly before the closing ']' is a literal dash.
                return _combine_char_groups(base, _CharGroup(frozenset('-'), False), negate=False)
            else:
                end = _CharGroup(frozenset(self.any_but(*self.SPECIAL_CHARS_INNER)), False)
            if len(base.chars) != 1 or len(end.chars) != 1:
                raise InvalidSyntax(f"Invalid Character-range: {self.data[start:self.index]}")
            low, high = ord(*base.chars), ord(*end.chars)
            if low > high:
                raise InvalidSyntax(f"Invalid Character-range: {self.data[start:self.index]}")
            return _CharGroup(frozenset((chr(i) for i in range(low, high + 1))), False)
        return base
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def parse_pattern(pattern: str) -> Pattern:
    """Parse the python-style regex `pattern` and return the simplified `Pattern`."""
    parser = _ParsePattern(pattern)
    return parser.parse().simplify()
|