PyPI - omextra - Versions diffs - 0.0.0.dev497__py3-none-any.whl → 0.0.0.dev499__py3-none-any.whl - Mend

omextra 0.0.0.dev497__py3-none-any.whl → 0.0.0.dev499__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

omextra/text/abnf/__init__.py +45 -14
omextra/text/abnf/_dataclasses.py +246 -0
omextra/text/abnf/base.py +6 -279
omextra/text/abnf/core.py +22 -10
omextra/text/abnf/grammars.py +235 -0
omextra/text/abnf/matches.py +145 -0
omextra/text/abnf/meta.py +39 -17
omextra/text/abnf/ops.py +67 -5
omextra/text/abnf/opto.py +167 -64
omextra/text/abnf/parsing.py +53 -5
omextra/text/abnf/utils.py +38 -41
omextra/text/abnf/visitors.py +1 -1
{omextra-0.0.0.dev497.dist-info → omextra-0.0.0.dev499.dist-info}/METADATA +2 -2
{omextra-0.0.0.dev497.dist-info → omextra-0.0.0.dev499.dist-info}/RECORD +18 -16
{omextra-0.0.0.dev497.dist-info → omextra-0.0.0.dev499.dist-info}/WHEEL +0 -0
{omextra-0.0.0.dev497.dist-info → omextra-0.0.0.dev499.dist-info}/entry_points.txt +0 -0
{omextra-0.0.0.dev497.dist-info → omextra-0.0.0.dev499.dist-info}/licenses/LICENSE +0 -0
{omextra-0.0.0.dev497.dist-info → omextra-0.0.0.dev499.dist-info}/top_level.txt +0 -0

omextra/text/abnf/ops.py CHANGED Viewed

@@ -7,6 +7,7 @@ from omlish import lang
 from .base import CompositeOp
 from .base import LeafOp
 from .base import Op
+from .base import OpTuple
 ##
@@ -106,19 +107,51 @@ class Concat(CompositeOp, lang.Final):
     def __init__(self, *children: Op) -> None:
         super().__init__()
-        for c in check.not_empty(children):
+        check.arg(len(children) > 1)
+        for i, c in enumerate(children):
             check.isinstance(c, Op)
+            if i:
+                check.state(not (isinstance(c, Concat) and isinstance(children[i - 1], Concat)))
         self._children = children
     @property
-    def children(self) -> ta.Sequence[Op]:
+    def children(self) -> OpTuple:
         return self._children
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({", ".join(map(repr, self._children))})'
+    def replace_children(self, *children: Op) -> Op:
+        if children == self._children:
+            return self
+        return concat(*children)
+def concat(*children: Op) -> Op:
+    if len(children) == 1:
+        return children[0]
+    check.not_empty(children)
-concat = Concat
+    lst: list[Op | list[Op]] = []
+    for c in children:
+        if (
+                lst and
+                isinstance(c, Concat) and
+                isinstance(ll := lst[-1], (Concat, list))
+        ):
+            if isinstance(ll, list):
+                ll.extend(c.children)
+            else:
+                lst.append([*ta.cast(list, lst.pop()), *c.children])
+        else:
+            lst.append(c)
+    if len(lst) == 1:
+        return Concat(*e) if isinstance(e := lst[0], list) else e
+    return Concat(*[Concat(*e) if isinstance(e, list) else e for e in lst])
 ##
@@ -161,12 +194,19 @@ class Repeat(CompositeOp, lang.Final):
         return self._child
     @property
-    def children(self) -> ta.Sequence[Op]:
+    def children(self) -> OpTuple:
         return (self._child,)
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({self._times}, {self._child!r})'
+    def replace_children(self, *children: Op) -> Op:
+        child = check.single(children)
+        if child == self._child:
+            return self
+        return Repeat(self._times, child)
 @ta.overload
 def repeat(child: Op) -> Repeat:  # noqa
@@ -238,7 +278,7 @@ class Either(CompositeOp, lang.Final):
         self._first_match = first_match
     @property
-    def children(self) -> ta.Sequence[Op]:
+    def children(self) -> OpTuple:
         return self._children
     @property
@@ -252,6 +292,12 @@ class Either(CompositeOp, lang.Final):
             f'{", first_match=True" if self._first_match else ""})'
         )
+    def replace_children(self, *children: Op) -> Op:
+        if children == self._children:
+            return self
+        return Either(*children, first_match=self._first_match)
 either = Either
@@ -265,12 +311,28 @@ class RuleRef(Op, lang.Final):
         super().__init__()
         self._name = check.non_empty_str(name)
         self._name_f = name.casefold()
+    def coalesce(self, other: Op) -> Op:
+        """
+        Op nodes are compared by identity, and transformations return identical node instances when nothing has changed.
+        This method assists with that, preserving RuleRef node identity if the given node is otherwise equal.
+        """
+        if isinstance(other, RuleRef) and other.name_f == self.name_f:
+            return self
+        return other
     @property
     def name(self) -> str:
         return self._name
+    @property
+    def name_f(self) -> str:
+        return self._name_f
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({self._name!r})'

omextra/text/abnf/opto.py CHANGED Viewed

@@ -1,18 +1,24 @@
 """
 TODO:
- - Merge concat
- - Merge concatted literals
- - Regex
+ - origin tracking?
+ - minor opts:
+  - merge concat(range, range)
 """
+import abc
 import re
 import typing as ta
 from omlish import check
+from omlish import dataclasses as dc
+from omlish import lang
+from .base import CompositeOp
 from .base import Op
+from .grammars import Channel
+from .grammars import Grammar
+from .grammars import Rule
 from .internal import Regex
 from .ops import CaseInsensitiveStringLiteral
-from .ops import CompositeOp
 from .ops import Concat
 from .ops import Either
 from .ops import RangeLiteral
@@ -24,42 +30,109 @@ from .ops import StringLiteral
 ##
-def _build_op_regex_pat(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> str | None:
-    if isinstance(op, StringLiteral):
-        return re.escape(op.value)
+@dc.dataclass(frozen=True)
+class _RegexItem(lang.Abstract):
+    @property
+    @abc.abstractmethod
+    def pat(self) -> str:
+        raise NotImplementedError
-    elif isinstance(op, CaseInsensitiveStringLiteral):
-        return f'(?i:{re.escape(op.value)})'
+    @classmethod
+    def of_op(cls, op: Op) -> ta.Optional['_RegexItem']:
+        if isinstance(op, StringLiteral):
+            return _StringLiteralRegexItem(op.value)
+        elif isinstance(op, CaseInsensitiveStringLiteral):
+            return _CaseInsensitiveStringLiteralRegexItem(op.value)
+        elif isinstance(op, RangeLiteral):
+            lo = re.escape(op.value.lo)
+            hi = re.escape(op.value.hi)
+            return _RegexRegexItem(f'[{lo}-{hi}]')
+        elif isinstance(op, Regex):
+            return _RegexRegexItem(op.pat.pattern)
+        else:
+            return None
+    @classmethod
+    def of(cls, obj: ta.Union['_RegexItem', Op, None]) -> ta.Optional['_RegexItem']:
+        if obj is None:
+            return None
+        elif isinstance(obj, _RegexItem):
+            return obj
+        elif isinstance(obj, Op):
+            return cls.of_op(obj)
+        else:
+            raise TypeError(obj)
+@dc.dataclass(frozen=True)
+class _StringLiteralRegexItem(_RegexItem, lang.Final):
+    s: str
+    @property
+    def pat(self) -> str:
+        return re.escape(self.s)
+@dc.dataclass(frozen=True)
+class _CaseInsensitiveStringLiteralRegexItem(_RegexItem, lang.Final):
+    s: str
+    @property
+    def pat(self) -> str:
+        return f'(?i:{re.escape(self.s)})'
+@dc.dataclass(frozen=True)
+class _RegexRegexItem(_RegexItem, lang.Final):
+    ps: str
+    @property
+    def pat(self) -> str:
+        return self.ps
+def _regex_item_transform_op(op: Op) -> _RegexItem | None:
+    if isinstance(op, (StringLiteral, CaseInsensitiveStringLiteral, Regex)):
+        return None
     elif isinstance(op, RangeLiteral):
-        lo = re.escape(op.value.lo)
-        hi = re.escape(op.value.hi)
-        return f'[{lo}-{hi}]'
+        # Unlike other leafs we eagerly transform RangeLiteral to a regex as it's probably faster than the python impl,
+        # even alone.
+        return _RegexItem.of_op(op)
     elif isinstance(op, RuleRef):
         return None
-    elif isinstance(op, Regex):
-        return op.pat.pattern
     elif isinstance(op, Concat):
-        child_pats = [pats_by_op[child] for child in op.children]
-        if not all(ca is not None for ca in child_pats):
+        children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
+        if all(ca is not None for ca in children):
+            return _RegexRegexItem(''.join(check.not_none(ca).pat for ca in children))
+        if not any(ca is not None for ca in children):
             return None
-        return ''.join(ta.cast(str, ca) for ca in child_pats)
+        # FIXME: merge adjacent
+        return None
     elif isinstance(op, Repeat):
-        if (child_pat := pats_by_op[op.child]) is None:
+        child = _RegexItem.of(_regex_item_transform_op(op.child))
+        if child is None:
             return None
         # Wrap the child pattern in a non-capturing group if needed to ensure correct quantification. A pattern needs
         # wrapping if it contains multiple elements or operators (e.g., 'ab', 'a|b'). Single character classes [a-z] and
         # single escaped chars don't need wrapping.
-        needs_group = (
-            len(child_pat) > 1 and
-            not (child_pat.startswith('[') and child_pat.endswith(']'))
-        )
-        if needs_group:
+        if (
+                len(child_pat := child.pat) > 1 and
+                not (child_pat.startswith('[') and child_pat.endswith(']'))
+        ):
             child_pat = f'(?:{child_pat})'
         times = op.times
@@ -76,7 +149,7 @@ def _build_op_regex_pat(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> str |
         else:
             quantifier = f'{{{times.min},{times.max}}}'
-        return child_pat + quantifier
+        return _RegexRegexItem(child_pat + quantifier)
     elif isinstance(op, Either):
         # Only convert Either if first_match is True, as regex alternation uses first-match semantics. ABNF Either with
@@ -84,71 +157,101 @@ def _build_op_regex_pat(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> str |
         if not op.first_match:
             return None
-        child_pats = [pats_by_op[child] for child in op.children]
-        if not all(ca is not None for ca in child_pats):
+        children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
+        if all(ca is not None for ca in children):
+            # Build regex alternation. Use a capturing group for the alternation
+            return _RegexRegexItem(''.join([
+                '(',
+                '|'.join(check.not_none(ca).pat for ca in children),
+                ')',
+            ]))
+        if not any(ca is not None for ca in children):
             return None
-        # Build regex alternation. Use a capturing group for the alternation
-        return f'({"|".join(ta.cast("ta.Sequence[str]", child_pats))})'
+        # FIXME: merge adjacent
+        return None
     else:
         raise TypeError(op)
-def _regex_transform_single_op(op: Op, pats_by_op: ta.Mapping[Op, str | None]) -> Op:
-    pat = pats_by_op[op]
+def _regex_transform_op(op: Op) -> Op:
+    v = _regex_item_transform_op(op)
-    if pat is not None:
-        if isinstance(op, Regex):
-            return op
-        return Regex(re.compile(pat))
+    if v is None:
+        return op
-    if isinstance(op, Concat):
-        new_children = tuple(_regex_transform_single_op(child, pats_by_op) for child in op.children)
-        if new_children == op.children:
-            return op
+    elif isinstance(v, _RegexItem):
+        return Regex(re.compile(v.pat))
-        return Concat(*new_children)
+    else:
+        raise TypeError(v)
-    elif isinstance(op, Repeat):
-        new_child = _regex_transform_single_op(op.child, pats_by_op)
-        if new_child == op.child:
-            return op
-        return Repeat(op.times, new_child)
+##
-    elif isinstance(op, Either):
-        new_children = tuple(_regex_transform_single_op(child, pats_by_op) for child in op.children)
-        if new_children == op.children:
-            return op
-        return Either(*new_children, first_match=op.first_match)
+def optimize_op(op: Op) -> Op:
+    op = _regex_transform_op(op)
     return op
-def regex_transform_op(op: Op) -> Op:
-    pats_by_op: dict[Op, str | None] = {}
+##
+def _inline_rules(fn: ta.Callable[[Rule], bool], gram: Grammar) -> Grammar:
+    cur_rule: Rule
+    inlined_rules: dict[str, Op] = {}
-    def analyze_tree(o: Op) -> None:
-        check.not_in(o, pats_by_op)
+    def rec_op(op: Op) -> Op:
+        if isinstance(op, RuleRef):
+            if op.name_f == cur_rule.name_f:
+                return op
-        if isinstance(o, CompositeOp):
-            for child in o.children:
-                analyze_tree(child)
+            if (r := gram.rule(op.name)) is None or not fn(r):
+                return op
-        pats_by_op[o] = _build_op_regex_pat(o, pats_by_op)
+            try:
+                return inlined_rules[r.name]
+            except KeyError:
+                pass
-    analyze_tree(op)
+            inlined_rules[op.name] = op
+            i_op = rec_op(r.op)
+            inlined_rules[op.name] = i_op
-    return _regex_transform_single_op(op, pats_by_op)
+            return op.coalesce(i_op)
+        elif isinstance(op, CompositeOp):
+            return op.replace_children(*map(rec_op, op.children))
+        else:
+            return op
+    new_rules: list[Rule] = []
+    for rule in gram.rules:
+        cur_rule = rule
+        new_rules.append(rule.replace_op(rec_op(rule.op)))
+    return gram.replace_rules(*new_rules)
 ##
-def optimize_op(op: Op) -> Op:
-    op = regex_transform_op(op)
+def optimize_grammar(
+        gram: Grammar,
+        *,
+        inline_channels: ta.Container[Channel] | None = (Channel.SPACE,),
+) -> Grammar:
+    if inline_channels:
+        gram = _inline_rules(lambda r: r.channel in inline_channels, gram)
-    return op
+    gram = gram.replace_rules(*[
+        r.replace_op(optimize_op(r.op))
+        for r in gram.rules
+    ])
+    return gram

omextra/text/abnf/parsing.py CHANGED Viewed

@@ -2,10 +2,12 @@ import typing as ta
 from omlish import check
-from .base import Grammar
-from .base import Match
 from .base import Op
+from .grammars import Grammar
+from .grammars import Rule
 from .internal import Regex
+from .matches import Match
+from .matches import longest_match
 from .ops import CaseInsensitiveStringLiteral
 from .ops import Concat
 from .ops import Either
@@ -35,6 +37,8 @@ class _Parser:
         self._source = source
         self._max_steps = max_steps
+        self._rules = self._grammar._rules  # Noqa
         self._dispatch: dict[type[Op], ta.Any] = {
             StringLiteral: self._iter_parse_string_literal,
             CaseInsensitiveStringLiteral: self._iter_parse_case_insensitive_string_literal,
@@ -67,6 +71,7 @@ class _Parser:
             source = self._source[start]  # noqa
         except IndexError:
             return
         # ranges are always case-sensitive
         if (value := op._value).lo <= source <= value.hi:  # noqa
             yield Match(op, start, start + 1, ())
@@ -160,7 +165,7 @@ class _Parser:
                 return
     def _iter_parse_rule_ref(self, op: RuleRef, start: int) -> ta.Iterator[Match]:
-        cp = self._grammar._rules_by_name_f[op._name_f].op  # noqa
+        cp = self._rules._rules_by_name_f[op._name_f].op  # noqa
         for cm in self.iter_parse(cp, start):
             yield Match(op, cm.start, cm.end, (cm,))
@@ -225,7 +230,7 @@ class _DebugParser(_Parser):
             ps = check.isinstance(op, RuleRef).name
         else:
             ps = self._op_str(op)
-        body = f'{start}:{self._source[start]!r} {ps}'
+        body = f'{start}:{self._source[start] if start < len(self._source) else ""!r} {ps}'
         if self._level > 2:
             self._write(f'{ws}+ {body}')
@@ -248,7 +253,7 @@ class _DebugParser(_Parser):
                 self._write(f'{ws}- {body}')
-##
+#
 def _iter_parse(
@@ -276,3 +281,46 @@ def _iter_parse(
         )
     return parser.iter_parse(op, start)
+##
+def iter_parse(
+        obj: Grammar | Rule | Op,
+        src: str,
+        *,
+        root: str | None = None,
+        start: int = 0,
+) -> ta.Iterator[Match]:
+    if isinstance(obj, Grammar):
+        gram = obj
+    elif isinstance(obj, Rule):
+        check.none(root)
+        gram = Grammar(obj, root=obj)
+    elif isinstance(obj, Op):
+        check.none(root)
+        gram = Grammar(Rule('root', obj), root='root')
+    else:
+        raise TypeError(obj)
+    return gram.iter_parse(
+        src,
+        root,
+        start=start,
+    )
+def parse(
+        obj: Grammar | Rule | Op,
+        src: str,
+        *,
+        root: str | None = None,
+        start: int = 0,
+) -> Match | None:
+    return longest_match(iter_parse(
+        obj,
+        src,
+        root=root,
+        start=start,
+    ))

omextra/text/abnf/utils.py CHANGED Viewed

@@ -1,62 +1,59 @@
-import itertools
 import textwrap
 import typing as ta
 from omlish import check
-from .base import Grammar
-from .base import Match
+from .grammars import Channel
+from .grammars import Grammar
+from .matches import Match
+from .matches import filter_matches
 from .ops import RuleRef
 ##
-def strip_insignificant_match_rules(m: Match, g: Grammar) -> Match:
-    def rec(c: Match) -> Match:
-        return c.flat_map_children(
-            lambda x: (
-                (rec(x),) if not (
-                    isinstance((xp := x.op), RuleRef) and
-                    check.not_none(g.rule(xp.name)).insignificant
-                ) else ()
-            ),
-        )
-    return rec(m)
+def filter_match_channels(
+        m: Match,
+        g: Grammar,
+        *,
+        keep: ta.Container[Channel] | None = None,
+        remove: ta.Container[Channel] | None = None,
+        keep_children: bool = False,
+) -> Match:
+    if keep is None and remove is None:
+        return m
+    def fn(x: Match) -> bool:
+        if not isinstance((rr := x.op), RuleRef):
+            return False
-def only_match_rules(m: Match) -> Match:
-    def rec(c: Match) -> ta.Iterable[Match]:
-        if isinstance(c.op, RuleRef):
-            return (c.flat_map_children(rec),)
-        else:
-            return itertools.chain.from_iterable(map(rec, c.children))
-    return m.flat_map_children(rec)
+        r = check.not_none(g.rule(rr.name))
+        if keep is not None and r.channel not in keep:
+            return False
+        if remove is not None and r.channel in remove:
+            return False
+        return True
+    return filter_matches(
+        fn,
+        m,
+        keep_children=keep_children,
+    )
 #
-def parse_rules(
-        grammar: Grammar,
-        source: str,
-        root: str | None = None,
-        *,
-        start: int = 0,
-        **kwargs: ta.Any,
-) -> Match | None:
-    if (match := grammar.parse(
-            source,
-            root,
-            start=start,
-            **kwargs,
-    )) is None:
-        return None
-    match = only_match_rules(match)
-    match = strip_insignificant_match_rules(match, grammar)
-    return match
+def only_match_rules(m: Match) -> Match:
+    return filter_matches(
+        lambda x: isinstance(x.op, RuleRef),
+        m,
+        keep_children=True,
+    )
 ##

omextra/text/abnf/visitors.py CHANGED Viewed

@@ -5,8 +5,8 @@ from omlish import collections as col
 from omlish import dispatch
 from omlish import lang
-from .base import Match
 from .base import Op
+from .matches import Match
 from .ops import RuleRef

{omextra-0.0.0.dev497.dist-info → omextra-0.0.0.dev499.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: omextra
-Version: 0.0.0.dev497
+Version: 0.0.0.dev499
 Summary: omextra
 Author: wrmsr
 License-Expression: BSD-3-Clause
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: omlish==0.0.0.dev497
+Requires-Dist: omlish==0.0.0.dev499
 Dynamic: license-file
 # Overview

omextra 0.0.0.dev497__py3-none-any.whl → 0.0.0.dev499__py3-none-any.whl

omextra 0.0.0.dev497__py3-none-any.whl → 0.0.0.dev499__py3-none-any.whl