omextra 0.0.0.dev497__py3-none-any.whl → 0.0.0.dev499__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
omextra/text/abnf/ops.py CHANGED
@@ -7,6 +7,7 @@ from omlish import lang
7
7
  from .base import CompositeOp
8
8
  from .base import LeafOp
9
9
  from .base import Op
10
+ from .base import OpTuple
10
11
 
11
12
 
12
13
  ##
@@ -106,19 +107,51 @@ class Concat(CompositeOp, lang.Final):
106
107
  def __init__(self, *children: Op) -> None:
107
108
  super().__init__()
108
109
 
109
- for c in check.not_empty(children):
110
+ check.arg(len(children) > 1)
111
+ for i, c in enumerate(children):
110
112
  check.isinstance(c, Op)
113
+ if i:
114
+ check.state(not (isinstance(c, Concat) and isinstance(children[i - 1], Concat)))
111
115
  self._children = children
112
116
 
113
117
  @property
114
- def children(self) -> ta.Sequence[Op]:
118
+ def children(self) -> OpTuple:
115
119
  return self._children
116
120
 
117
121
  def __repr__(self) -> str:
118
122
  return f'{self.__class__.__name__}@{id(self):x}({", ".join(map(repr, self._children))})'
119
123
 
124
+ def replace_children(self, *children: Op) -> Op:
125
+ if children == self._children:
126
+ return self
127
+
128
+ return concat(*children)
129
+
130
+
131
+ def concat(*children: Op) -> Op:
132
+ if len(children) == 1:
133
+ return children[0]
134
+
135
+ check.not_empty(children)
120
136
 
121
- concat = Concat
137
+ lst: list[Op | list[Op]] = []
138
+ for c in children:
139
+ if (
140
+ lst and
141
+ isinstance(c, Concat) and
142
+ isinstance(ll := lst[-1], (Concat, list))
143
+ ):
144
+ if isinstance(ll, list):
145
+ ll.extend(c.children)
146
+ else:
147
+ lst.append([*ta.cast(list, lst.pop()), *c.children])
148
+ else:
149
+ lst.append(c)
150
+
151
+ if len(lst) == 1:
152
+ return Concat(*e) if isinstance(e := lst[0], list) else e
153
+
154
+ return Concat(*[Concat(*e) if isinstance(e, list) else e for e in lst])
122
155
 
123
156
 
124
157
  ##
@@ -161,12 +194,19 @@ class Repeat(CompositeOp, lang.Final):
161
194
  return self._child
162
195
 
163
196
  @property
164
- def children(self) -> ta.Sequence[Op]:
197
+ def children(self) -> OpTuple:
165
198
  return (self._child,)
166
199
 
167
200
  def __repr__(self) -> str:
168
201
  return f'{self.__class__.__name__}@{id(self):x}({self._times}, {self._child!r})'
169
202
 
203
+ def replace_children(self, *children: Op) -> Op:
204
+ child = check.single(children)
205
+ if child == self._child:
206
+ return self
207
+
208
+ return Repeat(self._times, child)
209
+
170
210
 
171
211
  @ta.overload
172
212
  def repeat(child: Op) -> Repeat: # noqa
@@ -238,7 +278,7 @@ class Either(CompositeOp, lang.Final):
238
278
  self._first_match = first_match
239
279
 
240
280
  @property
241
- def children(self) -> ta.Sequence[Op]:
281
+ def children(self) -> OpTuple:
242
282
  return self._children
243
283
 
244
284
  @property
@@ -252,6 +292,12 @@ class Either(CompositeOp, lang.Final):
252
292
  f'{", first_match=True" if self._first_match else ""})'
253
293
  )
254
294
 
295
+ def replace_children(self, *children: Op) -> Op:
296
+ if children == self._children:
297
+ return self
298
+
299
+ return Either(*children, first_match=self._first_match)
300
+
255
301
 
256
302
  either = Either
257
303
 
@@ -265,12 +311,28 @@ class RuleRef(Op, lang.Final):
265
311
  super().__init__()
266
312
 
267
313
  self._name = check.non_empty_str(name)
314
+
268
315
  self._name_f = name.casefold()
269
316
 
317
+ def coalesce(self, other: Op) -> Op:
318
+ """
319
+ Op nodes are compared by identity, and transformations return identical node instances when nothing has changed.
320
+ This method assists with that, preserving RuleRef node identity if the given node is otherwise equal.
321
+ """
322
+
323
+ if isinstance(other, RuleRef) and other.name_f == self.name_f:
324
+ return self
325
+
326
+ return other
327
+
270
328
  @property
271
329
  def name(self) -> str:
272
330
  return self._name
273
331
 
332
+ @property
333
+ def name_f(self) -> str:
334
+ return self._name_f
335
+
274
336
  def __repr__(self) -> str:
275
337
  return f'{self.__class__.__name__}@{id(self):x}({self._name!r})'
276
338
 
omextra/text/abnf/opto.py CHANGED
@@ -1,18 +1,24 @@
1
1
  """
2
2
  TODO:
3
- - Merge concat
4
- - Merge concatted literals
5
- - Regex
3
+ - origin tracking?
4
+ - minor opts:
5
+ - merge concat(range, range)
6
6
  """
7
+ import abc
7
8
  import re
8
9
  import typing as ta
9
10
 
10
11
  from omlish import check
12
+ from omlish import dataclasses as dc
13
+ from omlish import lang
11
14
 
15
+ from .base import CompositeOp
12
16
  from .base import Op
17
+ from .grammars import Channel
18
+ from .grammars import Grammar
19
+ from .grammars import Rule
13
20
  from .internal import Regex
14
21
  from .ops import CaseInsensitiveStringLiteral
15
- from .ops import CompositeOp
16
22
  from .ops import Concat
17
23
  from .ops import Either
18
24
  from .ops import RangeLiteral
@@ -24,42 +30,109 @@ from .ops import StringLiteral
24
30
  ##
25
31
 
26
32
 
27
- def _build_op_regex_pat(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> str | None:
28
- if isinstance(op, StringLiteral):
29
- return re.escape(op.value)
33
+ @dc.dataclass(frozen=True)
34
+ class _RegexItem(lang.Abstract):
35
+ @property
36
+ @abc.abstractmethod
37
+ def pat(self) -> str:
38
+ raise NotImplementedError
30
39
 
31
- elif isinstance(op, CaseInsensitiveStringLiteral):
32
- return f'(?i:{re.escape(op.value)})'
40
+ @classmethod
41
+ def of_op(cls, op: Op) -> ta.Optional['_RegexItem']:
42
+ if isinstance(op, StringLiteral):
43
+ return _StringLiteralRegexItem(op.value)
44
+
45
+ elif isinstance(op, CaseInsensitiveStringLiteral):
46
+ return _CaseInsensitiveStringLiteralRegexItem(op.value)
47
+
48
+ elif isinstance(op, RangeLiteral):
49
+ lo = re.escape(op.value.lo)
50
+ hi = re.escape(op.value.hi)
51
+ return _RegexRegexItem(f'[{lo}-{hi}]')
52
+
53
+ elif isinstance(op, Regex):
54
+ return _RegexRegexItem(op.pat.pattern)
55
+
56
+ else:
57
+ return None
58
+
59
+ @classmethod
60
+ def of(cls, obj: ta.Union['_RegexItem', Op, None]) -> ta.Optional['_RegexItem']:
61
+ if obj is None:
62
+ return None
63
+
64
+ elif isinstance(obj, _RegexItem):
65
+ return obj
66
+
67
+ elif isinstance(obj, Op):
68
+ return cls.of_op(obj)
69
+
70
+ else:
71
+ raise TypeError(obj)
72
+
73
+
74
+ @dc.dataclass(frozen=True)
75
+ class _StringLiteralRegexItem(_RegexItem, lang.Final):
76
+ s: str
77
+
78
+ @property
79
+ def pat(self) -> str:
80
+ return re.escape(self.s)
81
+
82
+
83
+ @dc.dataclass(frozen=True)
84
+ class _CaseInsensitiveStringLiteralRegexItem(_RegexItem, lang.Final):
85
+ s: str
86
+
87
+ @property
88
+ def pat(self) -> str:
89
+ return f'(?i:{re.escape(self.s)})'
90
+
91
+
92
+ @dc.dataclass(frozen=True)
93
+ class _RegexRegexItem(_RegexItem, lang.Final):
94
+ ps: str
95
+
96
+ @property
97
+ def pat(self) -> str:
98
+ return self.ps
99
+
100
+
101
+ def _regex_item_transform_op(op: Op) -> _RegexItem | None:
102
+ if isinstance(op, (StringLiteral, CaseInsensitiveStringLiteral, Regex)):
103
+ return None
33
104
 
34
105
  elif isinstance(op, RangeLiteral):
35
- lo = re.escape(op.value.lo)
36
- hi = re.escape(op.value.hi)
37
- return f'[{lo}-{hi}]'
106
+ # Unlike other leafs we eagerly transform RangeLiteral to a regex as it's probably faster than the python impl,
107
+ # even alone.
108
+ return _RegexItem.of_op(op)
38
109
 
39
110
  elif isinstance(op, RuleRef):
40
111
  return None
41
112
 
42
- elif isinstance(op, Regex):
43
- return op.pat.pattern
44
-
45
113
  elif isinstance(op, Concat):
46
- child_pats = [pats_by_op[child] for child in op.children]
47
- if not all(ca is not None for ca in child_pats):
114
+ children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
115
+ if all(ca is not None for ca in children):
116
+ return _RegexRegexItem(''.join(check.not_none(ca).pat for ca in children))
117
+
118
+ if not any(ca is not None for ca in children):
48
119
  return None
49
- return ''.join(ta.cast(str, ca) for ca in child_pats)
120
+
121
+ # FIXME: merge adjacent
122
+ return None
50
123
 
51
124
  elif isinstance(op, Repeat):
52
- if (child_pat := pats_by_op[op.child]) is None:
125
+ child = _RegexItem.of(_regex_item_transform_op(op.child))
126
+ if child is None:
53
127
  return None
54
128
 
55
129
  # Wrap the child pattern in a non-capturing group if needed to ensure correct quantification. A pattern needs
56
130
  # wrapping if it contains multiple elements or operators (e.g., 'ab', 'a|b'). Single character classes [a-z] and
57
131
  # single escaped chars don't need wrapping.
58
- needs_group = (
59
- len(child_pat) > 1 and
60
- not (child_pat.startswith('[') and child_pat.endswith(']'))
61
- )
62
- if needs_group:
132
+ if (
133
+ len(child_pat := child.pat) > 1 and
134
+ not (child_pat.startswith('[') and child_pat.endswith(']'))
135
+ ):
63
136
  child_pat = f'(?:{child_pat})'
64
137
 
65
138
  times = op.times
@@ -76,7 +149,7 @@ def _build_op_regex_pat(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> str |
76
149
  else:
77
150
  quantifier = f'{{{times.min},{times.max}}}'
78
151
 
79
- return child_pat + quantifier
152
+ return _RegexRegexItem(child_pat + quantifier)
80
153
 
81
154
  elif isinstance(op, Either):
82
155
  # Only convert Either if first_match is True, as regex alternation uses first-match semantics. ABNF Either with
@@ -84,71 +157,101 @@ def _build_op_regex_pat(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> str |
84
157
  if not op.first_match:
85
158
  return None
86
159
 
87
- child_pats = [pats_by_op[child] for child in op.children]
88
- if not all(ca is not None for ca in child_pats):
160
+ children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
161
+ if all(ca is not None for ca in children):
162
+ # Build regex alternation. Use a capturing group for the alternation
163
+ return _RegexRegexItem(''.join([
164
+ '(',
165
+ '|'.join(check.not_none(ca).pat for ca in children),
166
+ ')',
167
+ ]))
168
+
169
+ if not any(ca is not None for ca in children):
89
170
  return None
90
171
 
91
- # Build regex alternation. Use a capturing group for the alternation
92
- return f'({"|".join(ta.cast("ta.Sequence[str]", child_pats))})'
172
+ # FIXME: merge adjacent
173
+ return None
93
174
 
94
175
  else:
95
176
  raise TypeError(op)
96
177
 
97
178
 
98
- def _regex_transform_single_op(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> Op:
99
- pat = pats_by_op[op]
179
+ def _regex_transform_op(op: Op) -> Op:
180
+ v = _regex_item_transform_op(op)
100
181
 
101
- if pat is not None:
102
- if isinstance(op, Regex):
103
- return op
104
-
105
- return Regex(re.compile(pat))
182
+ if v is None:
183
+ return op
106
184
 
107
- if isinstance(op, Concat):
108
- new_children = tuple(_regex_transform_single_op(child, pats_by_op) for child in op.children)
109
- if new_children == op.children:
110
- return op
185
+ elif isinstance(v, _RegexItem):
186
+ return Regex(re.compile(v.pat))
111
187
 
112
- return Concat(*new_children)
188
+ else:
189
+ raise TypeError(v)
113
190
 
114
- elif isinstance(op, Repeat):
115
- new_child = _regex_transform_single_op(op.child, pats_by_op)
116
- if new_child == op.child:
117
- return op
118
191
 
119
- return Repeat(op.times, new_child)
192
+ ##
120
193
 
121
- elif isinstance(op, Either):
122
- new_children = tuple(_regex_transform_single_op(child, pats_by_op) for child in op.children)
123
- if new_children == op.children:
124
- return op
125
194
 
126
- return Either(*new_children, first_match=op.first_match)
195
+ def optimize_op(op: Op) -> Op:
196
+ op = _regex_transform_op(op)
127
197
 
128
198
  return op
129
199
 
130
200
 
131
- def regex_transform_op(op: Op) -> Op:
132
- pats_by_op: dict[Op, str | None] = {}
201
+ ##
202
+
203
+
204
+ def _inline_rules(fn: ta.Callable[[Rule], bool], gram: Grammar) -> Grammar:
205
+ cur_rule: Rule
206
+ inlined_rules: dict[str, Op] = {}
133
207
 
134
- def analyze_tree(o: Op) -> None:
135
- check.not_in(o, pats_by_op)
208
+ def rec_op(op: Op) -> Op:
209
+ if isinstance(op, RuleRef):
210
+ if op.name_f == cur_rule.name_f:
211
+ return op
136
212
 
137
- if isinstance(o, CompositeOp):
138
- for child in o.children:
139
- analyze_tree(child)
213
+ if (r := gram.rule(op.name)) is None or not fn(r):
214
+ return op
140
215
 
141
- pats_by_op[o] = _build_op_regex_pat(o, pats_by_op)
216
+ try:
217
+ return inlined_rules[r.name]
218
+ except KeyError:
219
+ pass
142
220
 
143
- analyze_tree(op)
221
+ inlined_rules[op.name] = op
222
+ i_op = rec_op(r.op)
223
+ inlined_rules[op.name] = i_op
144
224
 
145
- return _regex_transform_single_op(op, pats_by_op)
225
+ return op.coalesce(i_op)
226
+
227
+ elif isinstance(op, CompositeOp):
228
+ return op.replace_children(*map(rec_op, op.children))
229
+
230
+ else:
231
+ return op
232
+
233
+ new_rules: list[Rule] = []
234
+ for rule in gram.rules:
235
+ cur_rule = rule
236
+ new_rules.append(rule.replace_op(rec_op(rule.op)))
237
+
238
+ return gram.replace_rules(*new_rules)
146
239
 
147
240
 
148
241
  ##
149
242
 
150
243
 
151
- def optimize_op(op: Op) -> Op:
152
- op = regex_transform_op(op)
244
+ def optimize_grammar(
245
+ gram: Grammar,
246
+ *,
247
+ inline_channels: ta.Container[Channel] | None = (Channel.SPACE,),
248
+ ) -> Grammar:
249
+ if inline_channels:
250
+ gram = _inline_rules(lambda r: r.channel in inline_channels, gram)
153
251
 
154
- return op
252
+ gram = gram.replace_rules(*[
253
+ r.replace_op(optimize_op(r.op))
254
+ for r in gram.rules
255
+ ])
256
+
257
+ return gram
@@ -2,10 +2,12 @@ import typing as ta
2
2
 
3
3
  from omlish import check
4
4
 
5
- from .base import Grammar
6
- from .base import Match
7
5
  from .base import Op
6
+ from .grammars import Grammar
7
+ from .grammars import Rule
8
8
  from .internal import Regex
9
+ from .matches import Match
10
+ from .matches import longest_match
9
11
  from .ops import CaseInsensitiveStringLiteral
10
12
  from .ops import Concat
11
13
  from .ops import Either
@@ -35,6 +37,8 @@ class _Parser:
35
37
  self._source = source
36
38
  self._max_steps = max_steps
37
39
 
40
+ self._rules = self._grammar._rules # Noqa
41
+
38
42
  self._dispatch: dict[type[Op], ta.Any] = {
39
43
  StringLiteral: self._iter_parse_string_literal,
40
44
  CaseInsensitiveStringLiteral: self._iter_parse_case_insensitive_string_literal,
@@ -67,6 +71,7 @@ class _Parser:
67
71
  source = self._source[start] # noqa
68
72
  except IndexError:
69
73
  return
74
+
70
75
  # ranges are always case-sensitive
71
76
  if (value := op._value).lo <= source <= value.hi: # noqa
72
77
  yield Match(op, start, start + 1, ())
@@ -160,7 +165,7 @@ class _Parser:
160
165
  return
161
166
 
162
167
  def _iter_parse_rule_ref(self, op: RuleRef, start: int) -> ta.Iterator[Match]:
163
- cp = self._grammar._rules_by_name_f[op._name_f].op # noqa
168
+ cp = self._rules._rules_by_name_f[op._name_f].op # noqa
164
169
  for cm in self.iter_parse(cp, start):
165
170
  yield Match(op, cm.start, cm.end, (cm,))
166
171
 
@@ -225,7 +230,7 @@ class _DebugParser(_Parser):
225
230
  ps = check.isinstance(op, RuleRef).name
226
231
  else:
227
232
  ps = self._op_str(op)
228
- body = f'{start}:{self._source[start]!r} {ps}'
233
+ body = f'{start}:{self._source[start] if start < len(self._source) else ""!r} {ps}'
229
234
 
230
235
  if self._level > 2:
231
236
  self._write(f'{ws}+ {body}')
@@ -248,7 +253,7 @@ class _DebugParser(_Parser):
248
253
  self._write(f'{ws}- {body}')
249
254
 
250
255
 
251
- ##
256
+ #
252
257
 
253
258
 
254
259
  def _iter_parse(
@@ -276,3 +281,46 @@ def _iter_parse(
276
281
  )
277
282
 
278
283
  return parser.iter_parse(op, start)
284
+
285
+
286
+ ##
287
+
288
+
289
+ def iter_parse(
290
+ obj: Grammar | Rule | Op,
291
+ src: str,
292
+ *,
293
+ root: str | None = None,
294
+ start: int = 0,
295
+ ) -> ta.Iterator[Match]:
296
+ if isinstance(obj, Grammar):
297
+ gram = obj
298
+ elif isinstance(obj, Rule):
299
+ check.none(root)
300
+ gram = Grammar(obj, root=obj)
301
+ elif isinstance(obj, Op):
302
+ check.none(root)
303
+ gram = Grammar(Rule('root', obj), root='root')
304
+ else:
305
+ raise TypeError(obj)
306
+
307
+ return gram.iter_parse(
308
+ src,
309
+ root,
310
+ start=start,
311
+ )
312
+
313
+
314
+ def parse(
315
+ obj: Grammar | Rule | Op,
316
+ src: str,
317
+ *,
318
+ root: str | None = None,
319
+ start: int = 0,
320
+ ) -> Match | None:
321
+ return longest_match(iter_parse(
322
+ obj,
323
+ src,
324
+ root=root,
325
+ start=start,
326
+ ))
@@ -1,62 +1,59 @@
1
- import itertools
2
1
  import textwrap
3
2
  import typing as ta
4
3
 
5
4
  from omlish import check
6
5
 
7
- from .base import Grammar
8
- from .base import Match
6
+ from .grammars import Channel
7
+ from .grammars import Grammar
8
+ from .matches import Match
9
+ from .matches import filter_matches
9
10
  from .ops import RuleRef
10
11
 
11
12
 
12
13
  ##
13
14
 
14
15
 
15
- def strip_insignificant_match_rules(m: Match, g: Grammar) -> Match:
16
- def rec(c: Match) -> Match:
17
- return c.flat_map_children(
18
- lambda x: (
19
- (rec(x),) if not (
20
- isinstance((xp := x.op), RuleRef) and
21
- check.not_none(g.rule(xp.name)).insignificant
22
- ) else ()
23
- ),
24
- )
25
- return rec(m)
16
+ def filter_match_channels(
17
+ m: Match,
18
+ g: Grammar,
19
+ *,
20
+ keep: ta.Container[Channel] | None = None,
21
+ remove: ta.Container[Channel] | None = None,
22
+ keep_children: bool = False,
23
+ ) -> Match:
24
+ if keep is None and remove is None:
25
+ return m
26
26
 
27
+ def fn(x: Match) -> bool:
28
+ if not isinstance((rr := x.op), RuleRef):
29
+ return False
27
30
 
28
- def only_match_rules(m: Match) -> Match:
29
- def rec(c: Match) -> ta.Iterable[Match]:
30
- if isinstance(c.op, RuleRef):
31
- return (c.flat_map_children(rec),)
32
- else:
33
- return itertools.chain.from_iterable(map(rec, c.children))
34
- return m.flat_map_children(rec)
31
+ r = check.not_none(g.rule(rr.name))
32
+
33
+ if keep is not None and r.channel not in keep:
34
+ return False
35
+
36
+ if remove is not None and r.channel in remove:
37
+ return False
38
+
39
+ return True
40
+
41
+ return filter_matches(
42
+ fn,
43
+ m,
44
+ keep_children=keep_children,
45
+ )
35
46
 
36
47
 
37
48
  #
38
49
 
39
50
 
40
- def parse_rules(
41
- grammar: Grammar,
42
- source: str,
43
- root: str | None = None,
44
- *,
45
- start: int = 0,
46
- **kwargs: ta.Any,
47
- ) -> Match | None:
48
- if (match := grammar.parse(
49
- source,
50
- root,
51
- start=start,
52
- **kwargs,
53
- )) is None:
54
- return None
55
-
56
- match = only_match_rules(match)
57
- match = strip_insignificant_match_rules(match, grammar)
58
-
59
- return match
51
+ def only_match_rules(m: Match) -> Match:
52
+ return filter_matches(
53
+ lambda x: isinstance(x.op, RuleRef),
54
+ m,
55
+ keep_children=True,
56
+ )
60
57
 
61
58
 
62
59
  ##
@@ -5,8 +5,8 @@ from omlish import collections as col
5
5
  from omlish import dispatch
6
6
  from omlish import lang
7
7
 
8
- from .base import Match
9
8
  from .base import Op
9
+ from .matches import Match
10
10
  from .ops import RuleRef
11
11
 
12
12
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: omextra
3
- Version: 0.0.0.dev497
3
+ Version: 0.0.0.dev499
4
4
  Summary: omextra
5
5
  Author: wrmsr
6
6
  License-Expression: BSD-3-Clause
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
14
14
  Requires-Python: >=3.13
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
- Requires-Dist: omlish==0.0.0.dev497
17
+ Requires-Dist: omlish==0.0.0.dev499
18
18
  Dynamic: license-file
19
19
 
20
20
  # Overview