omextra 0.0.0.dev496__py3-none-any.whl → 0.0.0.dev498__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omextra/text/abnf/__init__.py +51 -18
- omextra/text/abnf/_dataclasses.py +246 -0
- omextra/text/abnf/base.py +21 -257
- omextra/text/abnf/core.py +22 -10
- omextra/text/abnf/grammars.py +235 -0
- omextra/text/abnf/internal.py +1 -1
- omextra/text/abnf/matches.py +145 -0
- omextra/text/abnf/meta.py +45 -12
- omextra/text/abnf/ops.py +76 -9
- omextra/text/abnf/opto.py +257 -0
- omextra/text/abnf/parsing.py +134 -20
- omextra/text/abnf/utils.py +38 -41
- omextra/text/abnf/visitors.py +1 -1
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/METADATA +2 -2
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/RECORD +19 -16
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/WHEEL +0 -0
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/entry_points.txt +0 -0
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/licenses/LICENSE +0 -0
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/top_level.txt +0 -0
omextra/text/abnf/ops.py
CHANGED
@@ -4,8 +4,10 @@ from omlish import check
 from omlish import dataclasses as dc
 from omlish import lang

+from .base import CompositeOp
 from .base import LeafOp
 from .base import Op
+from .base import OpTuple


 ##
@@ -54,8 +56,8 @@ class RangeLiteral(Literal, lang.Final):
     hi: str

     def __post_init__(self) -> None:
-
-
+        for c in (self.lo, self.hi):
+            check.equal(len(check.non_empty_str(c)), 1)
         check.state(self.hi >= self.lo)

     def __init__(self, value: Range) -> None:
@@ -101,30 +103,62 @@ def literal(*args, case_sensitive=None):


 @ta.final
-class Concat(Op, lang.Final):
+class Concat(CompositeOp, lang.Final):
     def __init__(self, *children: Op) -> None:
         super().__init__()

-
+        check.arg(len(children) > 1)
+        for i, c in enumerate(children):
             check.isinstance(c, Op)
+            if i:
+                check.state(not (isinstance(c, Concat) and isinstance(children[i - 1], Concat)))
         self._children = children

     @property
-    def children(self) ->
+    def children(self) -> OpTuple:
         return self._children

     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({", ".join(map(repr, self._children))})'

+    def replace_children(self, *children: Op) -> Op:
+        if children == self._children:
+            return self
+
+        return concat(*children)
+

-concat
+def concat(*children: Op) -> Op:
+    if len(children) == 1:
+        return children[0]
+
+    check.not_empty(children)
+
+    lst: list[Op | list[Op]] = []
+    for c in children:
+        if (
+            lst and
+            isinstance(c, Concat) and
+            isinstance(ll := lst[-1], (Concat, list))
+        ):
+            if isinstance(ll, list):
+                ll.extend(c.children)
+            else:
+                lst.append([*ta.cast(list, lst.pop()), *c.children])
+        else:
+            lst.append(c)
+
+    if len(lst) == 1:
+        return Concat(*e) if isinstance(e := lst[0], list) else e
+
+    return Concat(*[Concat(*e) if isinstance(e, list) else e for e in lst])


 ##


 @ta.final
-class Repeat(Op, lang.Final):
+class Repeat(CompositeOp, lang.Final):
     @dc.dataclass(frozen=True)
     class Times:
         min: int = 0
@@ -159,9 +193,20 @@ class Repeat(Op, lang.Final):
     def child(self) -> Op:
         return self._child

+    @property
+    def children(self) -> OpTuple:
+        return (self._child,)
+
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({self._times}, {self._child!r})'

+    def replace_children(self, *children: Op) -> Op:
+        child = check.single(children)
+        if child == self._child:
+            return self
+
+        return Repeat(self._times, child)
+

 @ta.overload
 def repeat(child: Op) -> Repeat:  # noqa
@@ -223,7 +268,7 @@ def option(child: Op) -> Repeat:


 @ta.final
-class Either(Op, lang.Final):
+class Either(CompositeOp, lang.Final):
     def __init__(self, *children: Op, first_match: bool = False) -> None:
         super().__init__()

@@ -233,7 +278,7 @@ class Either(Op, lang.Final):
         self._first_match = first_match

     @property
-    def children(self) ->
+    def children(self) -> OpTuple:
         return self._children

     @property
@@ -247,6 +292,12 @@ class Either(Op, lang.Final):
             f'{", first_match=True" if self._first_match else ""})'
         )

+    def replace_children(self, *children: Op) -> Op:
+        if children == self._children:
+            return self
+
+        return Either(*children, first_match=self._first_match)
+

 either = Either

@@ -260,12 +311,28 @@ class RuleRef(Op, lang.Final):
         super().__init__()

         self._name = check.non_empty_str(name)
+
         self._name_f = name.casefold()

+    def coalesce(self, other: Op) -> Op:
+        """
+        Op nodes are compared by identity, and transformations return identical node instances when nothing has changed.
+        This method assists with that, preserving RuleRef node identity if the given node is otherwise equal.
+        """
+
+        if isinstance(other, RuleRef) and other.name_f == self.name_f:
+            return self
+
+        return other
+
     @property
     def name(self) -> str:
         return self._name

+    @property
+    def name_f(self) -> str:
+        return self._name_f
+
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({self._name!r})'

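Illustrative sketch (not taken from the package): how the new CompositeOp helpers above are meant to behave, assuming StringLiteral can be constructed directly from its text. replace_children returns the same node instance when the children are unchanged, and concat() passes a single child straight through.

    from omextra.text.abnf.ops import Concat, StringLiteral, concat

    a, b = StringLiteral('a'), StringLiteral('b')  # assumed constructor form
    node = Concat(a, b)

    assert concat(a) is a                                  # single child passes straight through
    assert node.replace_children(*node.children) is node   # unchanged children preserve node identity
    assert node.replace_children(b, a) is not node         # changed children build a fresh node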
omextra/text/abnf/opto.py
ADDED
@@ -0,0 +1,257 @@
+"""
+TODO:
+ - origin tracking?
+ - minor opts:
+  - merge concat(range, range)
+"""
+import abc
+import re
+import typing as ta
+
+from omlish import check
+from omlish import dataclasses as dc
+from omlish import lang
+
+from .base import CompositeOp
+from .base import Op
+from .grammars import Channel
+from .grammars import Grammar
+from .grammars import Rule
+from .internal import Regex
+from .ops import CaseInsensitiveStringLiteral
+from .ops import Concat
+from .ops import Either
+from .ops import RangeLiteral
+from .ops import Repeat
+from .ops import RuleRef
+from .ops import StringLiteral
+
+
+##
+
+
+@dc.dataclass(frozen=True)
+class _RegexItem(lang.Abstract):
+    @property
+    @abc.abstractmethod
+    def pat(self) -> str:
+        raise NotImplementedError
+
+    @classmethod
+    def of_op(cls, op: Op) -> ta.Optional['_RegexItem']:
+        if isinstance(op, StringLiteral):
+            return _StringLiteralRegexItem(op.value)
+
+        elif isinstance(op, CaseInsensitiveStringLiteral):
+            return _CaseInsensitiveStringLiteralRegexItem(op.value)
+
+        elif isinstance(op, RangeLiteral):
+            lo = re.escape(op.value.lo)
+            hi = re.escape(op.value.hi)
+            return _RegexRegexItem(f'[{lo}-{hi}]')
+
+        elif isinstance(op, Regex):
+            return _RegexRegexItem(op.pat.pattern)
+
+        else:
+            return None
+
+    @classmethod
+    def of(cls, obj: ta.Union['_RegexItem', Op, None]) -> ta.Optional['_RegexItem']:
+        if obj is None:
+            return None
+
+        elif isinstance(obj, _RegexItem):
+            return obj
+
+        elif isinstance(obj, Op):
+            return cls.of_op(obj)
+
+        else:
+            raise TypeError(obj)
+
+
+@dc.dataclass(frozen=True)
+class _StringLiteralRegexItem(_RegexItem, lang.Final):
+    s: str
+
+    @property
+    def pat(self) -> str:
+        return re.escape(self.s)
+
+
+@dc.dataclass(frozen=True)
+class _CaseInsensitiveStringLiteralRegexItem(_RegexItem, lang.Final):
+    s: str
+
+    @property
+    def pat(self) -> str:
+        return f'(?i:{re.escape(self.s)})'
+
+
+@dc.dataclass(frozen=True)
+class _RegexRegexItem(_RegexItem, lang.Final):
+    ps: str
+
+    @property
+    def pat(self) -> str:
+        return self.ps
+
+
+def _regex_item_transform_op(op: Op) -> _RegexItem | None:
+    if isinstance(op, (StringLiteral, CaseInsensitiveStringLiteral, Regex)):
+        return None
+
+    elif isinstance(op, RangeLiteral):
+        # Unlike other leafs we eagerly transform RangeLiteral to a regex as it's probably faster than the python impl,
+        # even alone.
+        return _RegexItem.of_op(op)
+
+    elif isinstance(op, RuleRef):
+        return None
+
+    elif isinstance(op, Concat):
+        children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
+        if all(ca is not None for ca in children):
+            return _RegexRegexItem(''.join(check.not_none(ca).pat for ca in children))
+
+        if not any(ca is not None for ca in children):
+            return None
+
+        # FIXME: merge adjacent
+        return None
+
+    elif isinstance(op, Repeat):
+        child = _RegexItem.of(_regex_item_transform_op(op.child))
+        if child is None:
+            return None
+
+        # Wrap the child pattern in a non-capturing group if needed to ensure correct quantification. A pattern needs
+        # wrapping if it contains multiple elements or operators (e.g., 'ab', 'a|b'). Single character classes [a-z] and
+        # single escaped chars don't need wrapping.
+        if (
+            len(child_pat := child.pat) > 1 and
+            not (child_pat.startswith('[') and child_pat.endswith(']'))
+        ):
+            child_pat = f'(?:{child_pat})'
+
+        times = op.times
+        if times.min == 0 and times.max is None:
+            quantifier = '*'
+        elif times.min == 1 and times.max is None:
+            quantifier = '+'
+        elif times.min == 0 and times.max == 1:
+            quantifier = '?'
+        elif times.max is None:
+            quantifier = f'{{{times.min},}}'
+        elif times.min == times.max:
+            quantifier = f'{{{times.min}}}'
+        else:
+            quantifier = f'{{{times.min},{times.max}}}'
+
+        return _RegexRegexItem(child_pat + quantifier)
+
+    elif isinstance(op, Either):
+        # Only convert Either if first_match is True, as regex alternation uses first-match semantics. ABNF Either with
+        # first_match=False uses longest-match semantics, which differs from regex.
+        if not op.first_match:
+            return None
+
+        children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
+        if all(ca is not None for ca in children):
+            # Build regex alternation. Use a capturing group for the alternation
+            return _RegexRegexItem(''.join([
+                '(',
+                '|'.join(check.not_none(ca).pat for ca in children),
+                ')',
+            ]))
+
+        if not any(ca is not None for ca in children):
+            return None
+
+        # FIXME: merge adjacent
+        return None
+
+    else:
+        raise TypeError(op)
+
+
+def _regex_transform_op(op: Op) -> Op:
+    v = _regex_item_transform_op(op)
+
+    if v is None:
+        return op
+
+    elif isinstance(v, _RegexItem):
+        return Regex(re.compile(v.pat))
+
+    else:
+        raise TypeError(v)
+
+
+##
+
+
+def optimize_op(op: Op) -> Op:
+    op = _regex_transform_op(op)
+
+    return op
+
+
+##
+
+
+def _inline_rules(fn: ta.Callable[[Rule], bool], gram: Grammar) -> Grammar:
+    cur_rule: Rule
+    inlined_rules: dict[str, Op] = {}
+
+    def rec_op(op: Op) -> Op:
+        if isinstance(op, RuleRef):
+            if op.name_f == cur_rule.name_f:
+                return op
+
+            if (r := gram.rule(op.name)) is None or not fn(r):
+                return op
+
+            try:
+                return inlined_rules[r.name]
+            except KeyError:
+                pass
+
+            inlined_rules[op.name] = op
+            i_op = rec_op(r.op)
+            inlined_rules[op.name] = i_op
+
+            return op.coalesce(i_op)
+
+        elif isinstance(op, CompositeOp):
+            return op.replace_children(*map(rec_op, op.children))
+
+        else:
+            return op
+
+    new_rules: list[Rule] = []
+    for rule in gram.rules:
+        cur_rule = rule
+        new_rules.append(rule.replace_op(rec_op(rule.op)))
+
+    return gram.replace_rules(*new_rules)
+
+
+##
+
+
+def optimize_grammar(
+    gram: Grammar,
+    *,
+    inline_channels: ta.Container[Channel] | None = (Channel.SPACE,),
+) -> Grammar:
+    if inline_channels:
+        gram = _inline_rules(lambda r: r.channel in inline_channels, gram)
+
+    gram = gram.replace_rules(*[
+        r.replace_op(optimize_op(r.op))
+        for r in gram.rules
+    ])
+
+    return gram
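Illustrative sketch (not part of the diff): the Repeat branch above lowers ABNF repetition bounds onto regex quantifiers. A self-contained example of that mapping using only the standard re module; the helper below is illustrative, not the package's API.

    import re

    def quantifier(min_: int, max_: int | None) -> str:
        # Mirrors the Times -> quantifier mapping in _regex_item_transform_op above.
        if min_ == 0 and max_ is None:
            return '*'
        elif min_ == 1 and max_ is None:
            return '+'
        elif min_ == 0 and max_ == 1:
            return '?'
        elif max_ is None:
            return f'{{{min_},}}'
        elif min_ == max_:
            return f'{{{min_}}}'
        else:
            return f'{{{min_},{max_}}}'

    # An ABNF 1*3DIGIT (a 0-9 range repeated 1 to 3 times) would lower to '[0-9]{1,3}'.
    pat = re.compile('[0-9]' + quantifier(1, 3))
    assert pat.fullmatch('123') and not pat.fullmatch('1234')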
omextra/text/abnf/parsing.py
CHANGED
@@ -2,10 +2,12 @@ import typing as ta

 from omlish import check

-from .base import Grammar
-from .base import Match
 from .base import Op
+from .grammars import Grammar
+from .grammars import Rule
 from .internal import Regex
+from .matches import Match
+from .matches import longest_match
 from .ops import CaseInsensitiveStringLiteral
 from .ops import Concat
 from .ops import Either
@@ -19,15 +21,23 @@ from .ops import StringLiteral


 class _Parser:
+    class MaxStepsExceededError(Exception):
+        pass
+
     def __init__(
         self,
         grammar: Grammar,
         source: str,
+        *,
+        max_steps: int | None = None,
     ) -> None:
         super().__init__()

         self._grammar = grammar
         self._source = source
+        self._max_steps = max_steps
+
+        self._rules = self._grammar._rules  # noqa

         self._dispatch: dict[type[Op], ta.Any] = {
             StringLiteral: self._iter_parse_string_literal,
@@ -40,6 +50,10 @@ class _Parser:
             Regex: self._iter_parse_regex,
         }

+        self._memo: dict[tuple[Op, int], tuple[Match, ...]] = {}
+
+        self._cur_step = 0
+
     def _iter_parse_string_literal(self, op: StringLiteral, start: int) -> ta.Iterator[Match]:
         if start < len(self._source):  # noqa
             source = self._source[start : start + len(op._value)]  # noqa
@@ -57,67 +71,120 @@ class _Parser:
             source = self._source[start]  # noqa
         except IndexError:
             return
+
         # ranges are always case-sensitive
         if (value := op._value).lo <= source <= value.hi:  # noqa
             yield Match(op, start, start + 1, ())

     def _iter_parse_concat(self, op: Concat, start: int) -> ta.Iterator[Match]:
-        i = 0
         match_tups: list[tuple[Match, ...]] = [()]
+
+        i = 0
         for cp in op._children:  # noqa
             next_match_tups: list[tuple[Match, ...]] = []
+
             for mt in match_tups:
                 for cm in self.iter_parse(cp, mt[-1].end if mt else start):
                     next_match_tups.append((*mt, cm))
                     i += 1
+
             if not next_match_tups:
                 return
+
             match_tups = next_match_tups
+
         if not i:
             return
+
         for mt in sorted(match_tups, key=len, reverse=True):
             yield Match(op, start, mt[-1].end if mt else start, mt)

     def _iter_parse_repeat(self, op: Repeat, start: int) -> ta.Iterator[Match]:
-
-
+        # Map from (repetition_count, end_position) to longest match tuple
+        matches_by_count_pos: dict[tuple[int, int], tuple[Match, ...]] = {(0, start): ()}
+        max_end_by_count: dict[int, int] = {0: start}
+
         i = 0
         while True:
             if op._times.max is not None and i == op._times.max:  # noqa
                 break
-
-
-
-
-
+
+            if self._max_steps is not None and self._cur_step > self._max_steps:
+                raise _Parser.MaxStepsExceededError(self._cur_step)
+            self._cur_step += 1
+
+            next_matches: dict[tuple[int, int], tuple[Match, ...]] = {}
+            next_max_end = max_end_by_count.get(i, -1)
+
+            for (count, end_pos), mt in matches_by_count_pos.items():
+                if count != i:
+                    continue
+
+                for cm in self.iter_parse(op._child, end_pos):  # noqa
+                    next_mt = (*mt, cm)
+                    next_key = (i + 1, cm.end)
+
+                    # Keep only the longest match tuple for each (count, position)
+                    if next_key not in next_matches or len(next_mt) > len(next_matches[next_key]):
+                        next_matches[next_key] = next_mt
+                    if cm.end > next_max_end:
+                        next_max_end = cm.end
+
+            if not next_matches:
+                break
+
+            # Check if we made progress (reached new positions)
+            if next_max_end <= max_end_by_count.get(i, -1):
                 break
+
             i += 1
-
-
+            matches_by_count_pos.update(next_matches)
+            max_end_by_count[i] = next_max_end
+
         if i < op._times.min:  # noqa
             return
-
-
+
+        # Collect valid matches and sort by (end_position, repetition_count) descending
+        valid_matches: list[tuple[int, int, tuple[Match, ...]]] = []
+        for (count, end_pos), mt in matches_by_count_pos.items():
+            if op._times.min <= count <= (op._times.max if op._times.max is not None else i):  # noqa
+                valid_matches.append((end_pos, count, mt))
+
+        for end_pos, _, mt in sorted(valid_matches, key=lambda x: (x[0], x[1]), reverse=True):
+            yield Match(op, start, end_pos, mt)

     def _iter_parse_either(self, op: Either, start: int) -> ta.Iterator[Match]:
         for cp in op._children:  # noqa
             found = False
+
             for cm in self.iter_parse(cp, start):
                 found = True
                 yield Match(op, start, cm.end, (cm,))
+
             if found and op._first_match:  # noqa
                 return

     def _iter_parse_rule_ref(self, op: RuleRef, start: int) -> ta.Iterator[Match]:
-        cp = self.
+        cp = self._rules._rules_by_name_f[op._name_f].op  # noqa
         for cm in self.iter_parse(cp, start):
             yield Match(op, cm.start, cm.end, (cm,))

     def _iter_parse_regex(self, op: Regex, start: int) -> ta.Iterator[Match]:
-
+        if (m := op._pat.match(self._source, start)) is not None:  # noqa
+            yield Match(op, start, m.end(), ())

     def iter_parse(self, op: Op, start: int) -> ta.Iterator[Match]:
-
+        if (key := (op, start)) in self._memo:
+            yield from self._memo[key]
+            return
+
+        if self._max_steps is not None and self._cur_step >= self._max_steps:
+            raise _Parser.MaxStepsExceededError(self._cur_step)
+        self._cur_step += 1
+
+        matches = tuple(self._dispatch[op.__class__](op, start))
+        self._memo[key] = matches
+        yield from matches


 ##
@@ -131,8 +198,9 @@ class _DebugParser(_Parser):
         level: int = 1,
         *,
         write: ta.Callable[[str], None] | None = None,
+        **kwargs: ta.Any,
     ) -> None:
-        super().__init__(grammar, source)
+        super().__init__(grammar, source, **kwargs)

         self._level = level
         if write is None:
@@ -162,7 +230,7 @@ class _DebugParser(_Parser):
             ps = check.isinstance(op, RuleRef).name
         else:
             ps = self._op_str(op)
-        body = f'{start}:{self._source[start]!r} {ps}'
+        body = f'{start}:{self._source[start] if start < len(self._source) else ""!r} {ps}'

         if self._level > 2:
             self._write(f'{ws}+ {body}')
@@ -185,7 +253,7 @@ class _DebugParser(_Parser):
             self._write(f'{ws}- {body}')


-
+#


 def _iter_parse(
@@ -195,18 +263,64 @@ def _iter_parse(
     start: int,
     *,
     debug: int = 0,
+    max_steps: int | None = None,
 ) -> ta.Iterator[Match]:
     parser: _Parser
     if debug:
         parser = _DebugParser(
             grammar,
             source,
+            max_steps=max_steps,
             level=debug,
         )
     else:
         parser = _Parser(
             grammar,
             source,
+            max_steps=max_steps,
         )

     return parser.iter_parse(op, start)
+
+
+##
+
+
+def iter_parse(
+    obj: Grammar | Rule | Op,
+    src: str,
+    *,
+    root: str | None = None,
+    start: int = 0,
+) -> ta.Iterator[Match]:
+    if isinstance(obj, Grammar):
+        gram = obj
+    elif isinstance(obj, Rule):
+        check.none(root)
+        gram = Grammar(obj, root=obj)
+    elif isinstance(obj, Op):
+        check.none(root)
+        gram = Grammar(Rule('root', obj), root='root')
+    else:
+        raise TypeError(obj)
+
+    return gram.iter_parse(
+        src,
+        root,
+        start=start,
+    )
+
+
+def parse(
+    obj: Grammar | Rule | Op,
+    src: str,
+    *,
+    root: str | None = None,
+    start: int = 0,
+) -> Match | None:
+    return longest_match(iter_parse(
+        obj,
+        src,
+        root=root,
+        start=start,
+    ))
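Illustrative sketch (not part of the diff): the new module-level entry points accept a Grammar, a Rule, or a bare Op. A hedged usage example, assuming repeat() with no bounds defaults to zero-or-more and that StringLiteral is constructed from its text; untested against the released wheel.

    from omextra.text.abnf.ops import StringLiteral, repeat
    from omextra.text.abnf.parsing import parse

    op = repeat(StringLiteral('ab'))   # assumed: unbounded repeat of the literal "ab"
    m = parse(op, 'ababab')            # a bare Op is wrapped in a single-rule grammar internally
    if m is not None:
        print(m.start, m.end)          # the longest match should span the whole input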