omextra 0.0.0.dev496__py3-none-any.whl → 0.0.0.dev498__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
omextra/text/abnf/core.py CHANGED
@@ -3,13 +3,15 @@ https://datatracker.ietf.org/doc/html/rfc5234
  """
  import typing as ta

- from .base import Grammar
- from .base import Rule
+ from .grammars import Channel
+ from .grammars import Grammar
+ from .grammars import Rule
  from .ops import concat
  from .ops import either
  from .ops import literal
  from .ops import repeat
  from .ops import rule
+ from .opto import optimize_grammar


  ##
@@ -23,6 +25,7 @@ CORE_RULES: ta.Sequence[Rule] = [
              literal('\x41', '\x5a'),
              literal('\x61', '\x7a'),
          ),
+         channel=Channel.CONTENT,
      ),

      Rule(
@@ -31,11 +34,13 @@ CORE_RULES: ta.Sequence[Rule] = [
              literal('0'),
              literal('1'),
          ),
+         channel=Channel.CONTENT,
      ),

      Rule(
          'CHAR',
          literal('\x01', '\x7f'),
+         channel=Channel.CONTENT,
      ),

      Rule(
@@ -44,12 +49,13 @@ CORE_RULES: ta.Sequence[Rule] = [
              literal('\x00', '\x1f'),
              literal('\x7f', case_sensitive=True),
          ),
+         channel=Channel.CONTENT,
      ),

      Rule(
          'CR',
          literal('\x0d', case_sensitive=True),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
@@ -58,17 +64,19 @@ CORE_RULES: ta.Sequence[Rule] = [
              rule('CR'),
              rule('LF'),
          ),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
          'DIGIT',
          literal('\x30', '\x39'),
+         channel=Channel.CONTENT,
      ),

      Rule(
          'DQUOTE',
          literal('\x22', case_sensitive=True),
+         channel=Channel.CONTENT,
      ),

      Rule(
@@ -82,18 +90,19 @@ CORE_RULES: ta.Sequence[Rule] = [
              literal('E'),
              literal('F'),
          ),
+         channel=Channel.CONTENT,
      ),

      Rule(
          'HTAB',
          literal('\x09', case_sensitive=True),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
          'LF',
          literal('\x0a', case_sensitive=True),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
@@ -107,23 +116,25 @@ CORE_RULES: ta.Sequence[Rule] = [
                  ),
              ),
          ),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
          'OCTET',
          literal('\x00', '\xff'),
+         channel=Channel.CONTENT,
      ),

      Rule(
          'SP',
          literal('\x20', case_sensitive=True),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
          'VCHAR',
          literal('\x21', '\x7e'),
+         channel=Channel.CONTENT,
      ),

      Rule(
@@ -132,10 +143,11 @@ CORE_RULES: ta.Sequence[Rule] = [
          rule('SP'),
          rule('HTAB'),
      ),
-     insignificant=True,
+     channel=Channel.SPACE,
  ),

  ]


- CORE_GRAMMAR = Grammar(*CORE_RULES)
+ RAW_CORE_GRAMMAR = Grammar(*CORE_RULES)
+ CORE_GRAMMAR = optimize_grammar(RAW_CORE_GRAMMAR)
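
For consumers of core.py, the visible changes are that the boolean insignificant=True flag on rules is replaced by a Channel enum, and that the exported CORE_GRAMMAR is now the optimized form of the new RAW_CORE_GRAMMAR. A minimal before/after sketch of rule construction, using only names that appear in this diff:

    from omextra.text.abnf.grammars import Channel
    from omextra.text.abnf.grammars import Rule
    from omextra.text.abnf.ops import literal

    # Formerly: Rule('SP', literal('\x20', case_sensitive=True), insignificant=True)
    sp = Rule('SP', literal('\x20', case_sensitive=True), channel=Channel.SPACE)

    # Content-bearing rules are now tagged explicitly instead of being left
    # on the default (STRUCTURE) channel.
    digit = Rule('DIGIT', literal('\x30', '\x39'), channel=Channel.CONTENT)
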
omextra/text/abnf/grammars.py ADDED
@@ -0,0 +1,235 @@
+ import enum
+ import typing as ta
+
+ from omlish import check
+ from omlish import lang
+
+ from .errors import AbnfError
+ from .errors import AbnfIncompleteParseError
+ from .matches import Match
+ from .matches import longest_match
+ from .ops import Op
+
+
+ with lang.auto_proxy_import(globals()):
+     from . import parsing
+
+
+ ##
+
+
+ class Channel(enum.Enum):
+     STRUCTURE = enum.auto()
+     CONTENT = enum.auto()
+     COMMENT = enum.auto()
+     SPACE = enum.auto()
+
+
+ class Rule(lang.Final):
+     def __init__(
+         self,
+         name: str,
+         op: Op,
+         *,
+         channel: Channel = Channel.STRUCTURE,
+     ) -> None:
+         super().__init__()
+
+         self._name = check.non_empty_str(name)
+         self._op = check.isinstance(op, Op)
+         self._channel = channel
+
+         self._name_f = name.casefold()
+
+     def __repr__(self) -> str:
+         return f'{self.__class__.__name__}({self._name!r}, channel={self._channel.name})'
+
+     def replace_op(self, op: Op) -> 'Rule':
+         return Rule(
+             self._name,
+             op,
+             channel=self._channel,
+         )
+
+     @property
+     def name(self) -> str:
+         return self._name
+
+     @property
+     def name_f(self) -> str:
+         return self._name_f
+
+     @property
+     def op(self) -> Op:
+         return self._op
+
+     @property
+     def channel(self) -> Channel:
+         return self._channel
+
+
+ #
+
+
+ class RulesCollection(lang.Final, ta.Collection[Rule]):
+     def __init__(self, *rules: ta.Union[Rule, 'RulesCollection']) -> None:
+         super().__init__()
+
+         rules_set: set[Rule] = set()
+         rules_by_name: dict[str, Rule] = {}
+         rules_by_name_f: dict[str, Rule] = {}
+         rules_by_op: dict[Op, Rule] = {}
+
+         def add(gr: Rule) -> None:
+             check.isinstance(gr, Rule)
+
+             check.not_in(gr, rules_set)
+             check.not_in(gr._name, rules_by_name)  # noqa
+             check.not_in(gr._name_f, rules_by_name_f)  # noqa
+             check.not_in(gr._op, rules_by_op)  # noqa
+
+             rules_set.add(gr)
+             rules_by_name[gr._name] = gr  # noqa
+             rules_by_name_f[gr._name_f] = gr  # noqa
+             rules_by_op[gr._op] = gr  # noqa
+
+         for e in rules:
+             if isinstance(e, RulesCollection):
+                 for c in e:
+                     add(c)
+             else:
+                 add(e)
+
+         self._rules_set = rules_set
+         self._rules_by_name: ta.Mapping[str, Rule] = rules_by_name
+         self._rules_by_name_f: ta.Mapping[str, Rule] = rules_by_name_f
+         self._rules_by_op: ta.Mapping[Op, Rule] = rules_by_op
+
+     @property
+     def rules_set(self) -> ta.AbstractSet[Rule]:
+         return self._rules_set
+
+     @property
+     def rules_by_name(self) -> ta.Mapping[str, Rule]:
+         return self._rules_by_name
+
+     @property
+     def rules_by_name_f(self) -> ta.Mapping[str, Rule]:
+         return self._rules_by_name_f
+
+     @property
+     def rules_by_op(self) -> ta.Mapping[Op, Rule]:
+         return self._rules_by_op
+
+     #
+
+     def __len__(self) -> int:
+         return len(self._rules_set)
+
+     def __iter__(self) -> ta.Iterator[Rule]:
+         return iter(self._rules_set)
+
+     def __contains__(self, item: Rule) -> bool:  # type: ignore[override]
+         return item in self._rules_set
+
+     #
+
+     def rule(self, name: str) -> Rule | None:
+         return self._rules_by_name_f.get(name.casefold())
+
+
+ ##
+
+
+ class Grammar(lang.Final):
+     def __init__(
+         self,
+         *rules: Rule | RulesCollection,
+         root: Rule | str | None = None,
+     ) -> None:
+         super().__init__()
+
+         if len(rules) == 1 and isinstance(r0 := rules[0], RulesCollection):
+             self._rules = r0
+         else:
+             self._rules = RulesCollection(*rules)
+
+         if isinstance(root, str):
+             root = self._rules.rules_by_name_f[root.casefold()]
+         self._root = root
+
+     @property
+     def rules(self) -> RulesCollection:
+         return self._rules
+
+     @property
+     def root(self) -> Rule | None:
+         return self._root
+
+     #
+
+     def rule(self, name: str) -> Rule | None:
+         return self._rules.rule(name)
+
+     def replace_rules(self, *rules: Rule) -> 'Grammar':
+         rc = RulesCollection(*rules)
+         if rc.rules_set == self._rules.rules_set:
+             return self
+
+         return Grammar(
+             rc,
+             root=self._root.name if self._root is not None else None,
+         )
+
+     #
+
+     def iter_parse(
+         self,
+         source: str,
+         root: Rule | str | None = None,
+         *,
+         start: int = 0,
+         debug: int = 0,
+         **kwargs: ta.Any,
+     ) -> ta.Iterator[Match]:
+         if root is None:
+             if (root := self._root) is None:
+                 raise AbnfError('No root or default root specified')
+         else:
+             if isinstance(root, str):
+                 root = self._rules.rules_by_name_f[root.casefold()]
+             else:
+                 root = check.in_(check.isinstance(root, Rule), self._rules)
+
+         return parsing._iter_parse(  # noqa
+             self,
+             source,
+             root._op,  # noqa
+             start,
+             debug=debug,
+             **kwargs,
+         )
+
+     def parse(
+         self,
+         source: str,
+         root: str | None = None,
+         *,
+         start: int = 0,
+         complete: bool = False,
+         debug: int = 0,
+         **kwargs: ta.Any,
+     ) -> Match | None:
+         if (match := longest_match(self.iter_parse(
+             source,
+             root,
+             start=start,
+             debug=debug,
+             **kwargs,
+         ))) is None:
+             return None
+
+         if complete and (match.start, match.end) != (start, len(source)):
+             raise AbnfIncompleteParseError
+
+         return match
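
Putting the new Grammar API together: parse resolves the root rule case-insensitively (falling back to the grammar's default root), keeps the longest candidate match, and with complete=True raises AbnfIncompleteParseError on a partial parse. A usage sketch against CORE_GRAMMAR from core.py above; the asserted span assumes the RFC 5234 semantics of the ALPHA rule:

    from omextra.text.abnf.core import CORE_GRAMMAR

    # Root lookup is casefolded, so 'alpha' resolves the 'ALPHA' rule.
    m = CORE_GRAMMAR.parse('A', root='alpha', complete=True)
    assert m is not None and (m.start, m.end) == (0, 1)

    # CORE_GRAMMAR is built without a default root, so omitting root here
    # would raise AbnfError.
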
omextra/text/abnf/internal.py CHANGED
@@ -29,4 +29,4 @@ class Regex(InternalOp, LeafOp, lang.Final):
          return self._pat

      def __repr__(self) -> str:
-         return f'{self.__class__.__name__}@{id(self):x}({self._pat!r})'
+         return f'{self.__class__.__name__}@{id(self):x}({self._pat.pattern!r})'
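
The __repr__ fix unwraps the compiled pattern: self._pat is a compiled re.Pattern, whose own repr nests a re.compile(...) wrapper, while .pattern is just the source string. The difference, illustrated with plain re (values hypothetical):

    import re

    pat = re.compile('[0-9a-f]')
    repr(pat)          # "re.compile('[0-9a-f]')"
    repr(pat.pattern)  # "'[0-9a-f]'"
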
omextra/text/abnf/matches.py ADDED
@@ -0,0 +1,145 @@
+ import io
+ import itertools
+ import typing as ta
+
+ from omlish import lang
+
+ from .internal import Regex
+ from .ops import CaseInsensitiveStringLiteral
+ from .ops import Op
+ from .ops import RangeLiteral
+ from .ops import RuleRef
+ from .ops import StringLiteral
+
+
+ ##
+
+
+ @ta.final
+ class Match(ta.NamedTuple):
+     op: 'Op'
+     start: int
+     end: int
+     children: tuple['Match', ...]
+
+     @property
+     def length(self) -> int:
+         return self.end - self.start
+
+     #
+
+     def __repr__(self) -> str:
+         return (
+             f'{self.__class__.__name__}('
+             f'{self.op._match_repr()}, '  # noqa
+             f'{self.start}, {self.end}'
+             f'{f", {self.children!r}" if self.children else ""})'
+         )
+
+     def render_to(
+         self,
+         write: ta.Callable[[str], ta.Any],
+         *,
+         indent: int | None = None,
+         _depth: int = 0,
+     ) -> None:
+         ix: str | None = (' ' * (indent * _depth)) if indent is not None else None
+         if ix:
+             write(ix)
+
+         o = self.op
+
+         if isinstance(o, (StringLiteral, CaseInsensitiveStringLiteral)):
+             write(f'literal<{self.start}-{self.end}>({o.value!r})')
+
+         elif isinstance(o, RangeLiteral):
+             write(f'literal<{self.start}-{self.end}>({o.value.lo!r}-{o.value.hi!r})')
+
+         elif isinstance(o, Regex):
+             write(f'regex<{self.start}-{self.end}>({o.pat.pattern!r})')
+
+         else:
+             write(f'{o.__class__.__name__.lower()}<{self.start}-{self.end}>')
+
+         if isinstance(o, RuleRef):
+             write(f':{o.name}')
+
+         if self.children:
+             write('(')
+             if ix is not None:
+                 write('\n')
+
+             for i, c in enumerate(self.children):
+                 if i and ix is None:
+                     write(', ')
+
+                 c.render_to(write, indent=indent, _depth=_depth + 1)
+
+                 if ix is not None:
+                     write(',\n')
+
+             if ix:
+                 write(ix)
+
+             write(')')
+
+     def render(
+         self,
+         *,
+         indent: int | None = None,
+     ) -> str:
+         sb = io.StringIO()
+         self.render_to(sb.write, indent=indent)
+         return sb.getvalue()
+
+     def __str__(self) -> str:
+         return self.render()
+
+     #
+
+     def replace_children(self, *children: 'Match') -> 'Match':
+         if lang.seqs_identical(children, self.children):
+             return self
+
+         return self._replace(children=children)
+
+     def map_children(self, fn: ta.Callable[['Match'], 'Match']) -> 'Match':
+         return self.replace_children(*map(fn, self.children))
+
+     def flat_map_children(self, fn: ta.Callable[['Match'], ta.Iterable['Match']]) -> 'Match':
+         return self.replace_children(*itertools.chain.from_iterable(map(fn, self.children)))
+
+
+ ##
+
+
+ def longest_match(ms: ta.Iterable[Match]) -> Match | None:
+     bm: Match | None = None
+     bl = 0
+     for m in ms:
+         l = m.length
+         if bm is None or l > bl:
+             bm, bl = m, l
+     return bm
+
+
+ def filter_matches(
+     fn: ta.Callable[[Match], bool],
+     m: Match,
+     *,
+     keep_children: bool = False,
+ ) -> Match:
+     def inner(x: Match) -> ta.Iterable[Match]:
+         if fn(x):
+             return (rec(x),)
+
+         elif keep_children:
+             return lang.flatten(inner(c) for c in x.children)
+
+         else:
+             return ()
+
+     def rec(c: Match) -> Match:
+         return c.flat_map_children(inner)
+
+     return rec(m)
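
longest_match folds an iterable of candidates down to the first one of maximal length (ties keep the earliest), and filter_matches prunes a match tree by predicate, with keep_children=True splicing a dropped node's surviving descendants into its place. A sketch pairing Grammar.iter_parse with these helpers; the exact rendered tree shape depends on how the grammar was optimized:

    from omextra.text.abnf.core import CORE_GRAMMAR
    from omextra.text.abnf.matches import longest_match

    ms = CORE_GRAMMAR.iter_parse('9', root='digit')
    m = longest_match(ms)  # first candidate of maximal length
    assert m is not None and m.length == 1
    print(m.render(indent=2))
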
omextra/text/abnf/meta.py CHANGED
@@ -7,12 +7,13 @@ from omlish import check
  from omlish import dataclasses as dc
  from omlish import lang

- from .base import Grammar
- from .base import Match
  from .base import Op
- from .base import Rule
  from .core import CORE_RULES
  from .errors import AbnfGrammarParseError
+ from .grammars import Channel
+ from .grammars import Grammar
+ from .grammars import Rule
+ from .matches import Match
  from .ops import Repeat
  from .ops import concat
  from .ops import either
@@ -20,8 +21,10 @@ from .ops import literal
  from .ops import option
  from .ops import repeat
  from .ops import rule
+ from .opto import optimize_grammar
+ from .utils import filter_match_channels
  from .utils import fix_ws
- from .utils import parse_rules
+ from .utils import only_match_rules
  from .visitors import RuleMatchVisitor


@@ -105,7 +108,7 @@ META_GRAMMAR_RULES: ta.Sequence[Rule] = [
                  rule('WSP'),
              ),
          ),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
@@ -114,7 +117,7 @@ META_GRAMMAR_RULES: ta.Sequence[Rule] = [
              rule('comment'),
              rule('CRLF'),
          ),
-         insignificant=True,
+         channel=Channel.SPACE,
      ),

      Rule(
@@ -129,6 +132,7 @@ META_GRAMMAR_RULES: ta.Sequence[Rule] = [
              ),
              rule('CRLF'),
          ),
+         channel=Channel.COMMENT,
      ),

      Rule(
@@ -409,12 +413,21 @@ META_GRAMMAR_RULES: ta.Sequence[Rule] = [
  ]


- META_GRAMMAR = Grammar(
+ RAW_META_GRAMMAR = Grammar(
      *CORE_RULES,
      *META_GRAMMAR_RULES,
      root='rulelist',
  )

+ META_GRAMMAR = optimize_grammar(
+     RAW_META_GRAMMAR,
+     inline_channels=(
+         Channel.CONTENT,
+         Channel.COMMENT,
+         Channel.SPACE,
+     ),
+ )
+

  ##

@@ -555,30 +568,50 @@ class MetaGrammarRuleMatchVisitor(RuleMatchVisitor[ta.Any]):
          return self.QuotedString(self._source[m.start + 1:m.end - 1])


+ ##
+
+
  def parse_grammar(
      source: str,
      *,
-     no_core_rules: bool = False,
      root: str | None = None,
+     no_core_rules: bool = False,
+     no_optimize: bool = False,
      **kwargs: ta.Any,
  ) -> Grammar:
      source = fix_ws(source)

-     if (mg_m := parse_rules(
-         META_GRAMMAR,
+     if (mg_m := META_GRAMMAR.parse(
          source,
          complete=True,
          **kwargs,
      )) is None:
          raise AbnfGrammarParseError(source)

+     mg_m = only_match_rules(mg_m)
+
+     mg_m = filter_match_channels(
+         mg_m,
+         META_GRAMMAR,
+         keep=(Channel.STRUCTURE,),
+         keep_children=True,
+     )
+
      check.isinstance(mg_m.op, Repeat)

      mg_rmv = MetaGrammarRuleMatchVisitor(source)
-     rules = [mg_rmv.visit_match(gg_cm) for gg_cm in mg_m.children]
+     rules = [
+         check.isinstance(mg_rmv.visit_match(gg_cm), Rule)
+         for gg_cm in mg_m.children
+     ]

-     return Grammar(
+     gram = Grammar(
          *rules,
          *(CORE_RULES if not no_core_rules else []),
          root=root,
      )
+
+     if not no_optimize:
+         gram = optimize_grammar(gram)
+
+     return gram
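
End to end, parse_grammar now parses ABNF source with the optimized meta-grammar, strips comment- and space-channel matches down to STRUCTURE before visiting, and optimizes the resulting grammar unless no_optimize=True. A usage sketch with a hypothetical one-rule grammar; RFC 5234 requires CRLF line endings, and fix_ws is assumed to tolerate already-normalized input:

    from omextra.text.abnf.meta import parse_grammar

    g = parse_grammar('greeting = "hello" SP "world"\r\n', root='greeting')

    # ABNF quoted strings are case-insensitive; core rules such as SP are
    # merged in unless no_core_rules=True.
    m = g.parse('hello world', complete=True)
    assert m is not None
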