omextra 0.0.0.dev496__py3-none-any.whl → 0.0.0.dev498__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omextra/text/abnf/__init__.py +51 -18
- omextra/text/abnf/_dataclasses.py +246 -0
- omextra/text/abnf/base.py +21 -257
- omextra/text/abnf/core.py +22 -10
- omextra/text/abnf/grammars.py +235 -0
- omextra/text/abnf/internal.py +1 -1
- omextra/text/abnf/matches.py +145 -0
- omextra/text/abnf/meta.py +45 -12
- omextra/text/abnf/ops.py +76 -9
- omextra/text/abnf/opto.py +257 -0
- omextra/text/abnf/parsing.py +134 -20
- omextra/text/abnf/utils.py +38 -41
- omextra/text/abnf/visitors.py +1 -1
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/METADATA +2 -2
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/RECORD +19 -16
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/WHEEL +0 -0
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/entry_points.txt +0 -0
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/licenses/LICENSE +0 -0
- {omextra-0.0.0.dev496.dist-info → omextra-0.0.0.dev498.dist-info}/top_level.txt +0 -0
omextra/text/abnf/ops.py
CHANGED
@@ -4,8 +4,10 @@ from omlish import check
 from omlish import dataclasses as dc
 from omlish import lang

+from .base import CompositeOp
 from .base import LeafOp
 from .base import Op
+from .base import OpTuple


 ##
@@ -54,8 +56,8 @@ class RangeLiteral(Literal, lang.Final):
     hi: str

     def __post_init__(self) -> None:
-
-
+        for c in (self.lo, self.hi):
+            check.equal(len(check.non_empty_str(c)), 1)
         check.state(self.hi >= self.lo)

     def __init__(self, value: Range) -> None:
@@ -101,30 +103,62 @@ def literal(*args, case_sensitive=None):


 @ta.final
-class Concat(Op, lang.Final):
+class Concat(CompositeOp, lang.Final):
     def __init__(self, *children: Op) -> None:
         super().__init__()

-
+        check.arg(len(children) > 1)
+        for i, c in enumerate(children):
             check.isinstance(c, Op)
+            if i:
+                check.state(not (isinstance(c, Concat) and isinstance(children[i - 1], Concat)))
         self._children = children

     @property
-    def children(self) ->
+    def children(self) -> OpTuple:
         return self._children

     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({", ".join(map(repr, self._children))})'

+    def replace_children(self, *children: Op) -> Op:
+        if children == self._children:
+            return self
+
+        return concat(*children)
+

-concat
+def concat(*children: Op) -> Op:
+    if len(children) == 1:
+        return children[0]
+
+    check.not_empty(children)
+
+    lst: list[Op | list[Op]] = []
+    for c in children:
+        if (
+            lst and
+            isinstance(c, Concat) and
+            isinstance(ll := lst[-1], (Concat, list))
+        ):
+            if isinstance(ll, list):
+                ll.extend(c.children)
+            else:
+                lst.append([*ta.cast(list, lst.pop()), *c.children])
+        else:
+            lst.append(c)
+
+    if len(lst) == 1:
+        return Concat(*e) if isinstance(e := lst[0], list) else e
+
+    return Concat(*[Concat(*e) if isinstance(e, list) else e for e in lst])


 ##


 @ta.final
-class Repeat(Op, lang.Final):
+class Repeat(CompositeOp, lang.Final):
     @dc.dataclass(frozen=True)
     class Times:
         min: int = 0
@@ -159,9 +193,20 @@ class Repeat(Op, lang.Final):
     def child(self) -> Op:
         return self._child

+    @property
+    def children(self) -> OpTuple:
+        return (self._child,)
+
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({self._times}, {self._child!r})'

+    def replace_children(self, *children: Op) -> Op:
+        child = check.single(children)
+        if child == self._child:
+            return self
+
+        return Repeat(self._times, child)
+

 @ta.overload
 def repeat(child: Op) -> Repeat:  # noqa
@@ -223,7 +268,7 @@ def option(child: Op) -> Repeat:


 @ta.final
-class Either(Op, lang.Final):
+class Either(CompositeOp, lang.Final):
     def __init__(self, *children: Op, first_match: bool = False) -> None:
         super().__init__()

@@ -233,7 +278,7 @@ class Either(Op, lang.Final):
         self._first_match = first_match

     @property
-    def children(self) ->
+    def children(self) -> OpTuple:
         return self._children

     @property
@@ -247,6 +292,12 @@ class Either(Op, lang.Final):
             f'{", first_match=True" if self._first_match else ""})'
         )

+    def replace_children(self, *children: Op) -> Op:
+        if children == self._children:
+            return self
+
+        return Either(*children, first_match=self._first_match)
+

 either = Either

@@ -260,12 +311,28 @@ class RuleRef(Op, lang.Final):
         super().__init__()

         self._name = check.non_empty_str(name)
+
         self._name_f = name.casefold()

+    def coalesce(self, other: Op) -> Op:
+        """
+        Op nodes are compared by identity, and transformations return identical node instances when nothing has changed.
+        This method assists with that, preserving RuleRef node identity if the given node is otherwise equal.
+        """
+
+        if isinstance(other, RuleRef) and other.name_f == self.name_f:
+            return self
+
+        return other
+
     @property
     def name(self) -> str:
         return self._name

+    @property
+    def name_f(self) -> str:
+        return self._name_f
+
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}@{id(self):x}({self._name!r})'

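Illustrative sketch (not taken from the package): how the new CompositeOp helpers above are meant to behave, assuming StringLiteral can be constructed directly from its text. replace_children returns the same node instance when the children are unchanged, and concat() passes a single child straight through.

    from omextra.text.abnf.ops import Concat, StringLiteral, concat

    a, b = StringLiteral('a'), StringLiteral('b')  # assumed constructor form
    node = Concat(a, b)

    assert concat(a) is a                                  # single child passes straight through
    assert node.replace_children(*node.children) is node   # unchanged children preserve node identity
    assert node.replace_children(b, a) is not node         # changed children build a fresh node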
omextra/text/abnf/opto.py
ADDED
@@ -0,0 +1,257 @@
+"""
+TODO:
+ - origin tracking?
+ - minor opts:
+  - merge concat(range, range)
+"""
+import abc
+import re
+import typing as ta
+
+from omlish import check
+from omlish import dataclasses as dc
+from omlish import lang
+
+from .base import CompositeOp
+from .base import Op
+from .grammars import Channel
+from .grammars import Grammar
+from .grammars import Rule
+from .internal import Regex
+from .ops import CaseInsensitiveStringLiteral
+from .ops import Concat
+from .ops import Either
+from .ops import RangeLiteral
+from .ops import Repeat
+from .ops import RuleRef
+from .ops import StringLiteral
+
+
+##
+
+
+@dc.dataclass(frozen=True)
+class _RegexItem(lang.Abstract):
+    @property
+    @abc.abstractmethod
+    def pat(self) -> str:
+        raise NotImplementedError
+
+    @classmethod
+    def of_op(cls, op: Op) -> ta.Optional['_RegexItem']:
+        if isinstance(op, StringLiteral):
+            return _StringLiteralRegexItem(op.value)
+
+        elif isinstance(op, CaseInsensitiveStringLiteral):
+            return _CaseInsensitiveStringLiteralRegexItem(op.value)
+
+        elif isinstance(op, RangeLiteral):
+            lo = re.escape(op.value.lo)
+            hi = re.escape(op.value.hi)
+            return _RegexRegexItem(f'[{lo}-{hi}]')
+
+        elif isinstance(op, Regex):
+            return _RegexRegexItem(op.pat.pattern)
+
+        else:
+            return None
+
+    @classmethod
+    def of(cls, obj: ta.Union['_RegexItem', Op, None]) -> ta.Optional['_RegexItem']:
+        if obj is None:
+            return None
+
+        elif isinstance(obj, _RegexItem):
+            return obj
+
+        elif isinstance(obj, Op):
+            return cls.of_op(obj)
+
+        else:
+            raise TypeError(obj)
+
+
+@dc.dataclass(frozen=True)
+class _StringLiteralRegexItem(_RegexItem, lang.Final):
+    s: str
+
+    @property
+    def pat(self) -> str:
+        return re.escape(self.s)
+
+
+@dc.dataclass(frozen=True)
+class _CaseInsensitiveStringLiteralRegexItem(_RegexItem, lang.Final):
+    s: str
+
+    @property
+    def pat(self) -> str:
+        return f'(?i:{re.escape(self.s)})'
+
+
+@dc.dataclass(frozen=True)
+class _RegexRegexItem(_RegexItem, lang.Final):
+    ps: str
+
+    @property
+    def pat(self) -> str:
+        return self.ps
+
+
+def _regex_item_transform_op(op: Op) -> _RegexItem | None:
+    if isinstance(op, (StringLiteral, CaseInsensitiveStringLiteral, Regex)):
+        return None
+
+    elif isinstance(op, RangeLiteral):
+        # Unlike other leafs we eagerly transform RangeLiteral to a regex as it's probably faster than the python impl,
+        # even alone.
+        return _RegexItem.of_op(op)
+
+    elif isinstance(op, RuleRef):
+        return None
+
+    elif isinstance(op, Concat):
+        children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
+        if all(ca is not None for ca in children):
+            return _RegexRegexItem(''.join(check.not_none(ca).pat for ca in children))
+
+        if not any(ca is not None for ca in children):
+            return None
+
+        # FIXME: merge adjacent
+        return None
+
+    elif isinstance(op, Repeat):
+        child = _RegexItem.of(_regex_item_transform_op(op.child))
+        if child is None:
+            return None
+
+        # Wrap the child pattern in a non-capturing group if needed to ensure correct quantification. A pattern needs
+        # wrapping if it contains multiple elements or operators (e.g., 'ab', 'a|b'). Single character classes [a-z] and
+        # single escaped chars don't need wrapping.
+        if (
+            len(child_pat := child.pat) > 1 and
+            not (child_pat.startswith('[') and child_pat.endswith(']'))
+        ):
+            child_pat = f'(?:{child_pat})'
+
+        times = op.times
+        if times.min == 0 and times.max is None:
+            quantifier = '*'
+        elif times.min == 1 and times.max is None:
+            quantifier = '+'
+        elif times.min == 0 and times.max == 1:
+            quantifier = '?'
+        elif times.max is None:
+            quantifier = f'{{{times.min},}}'
+        elif times.min == times.max:
+            quantifier = f'{{{times.min}}}'
+        else:
+            quantifier = f'{{{times.min},{times.max}}}'
+
+        return _RegexRegexItem(child_pat + quantifier)
+
+    elif isinstance(op, Either):
+        # Only convert Either if first_match is True, as regex alternation uses first-match semantics. ABNF Either with
+        # first_match=False uses longest-match semantics, which differs from regex.
+        if not op.first_match:
+            return None
+
+        children = [_regex_item_transform_op(child) or _RegexItem.of(child) for child in op.children]
+        if all(ca is not None for ca in children):
+            # Build regex alternation. Use a capturing group for the alternation
+            return _RegexRegexItem(''.join([
+                '(',
+                '|'.join(check.not_none(ca).pat for ca in children),
+                ')',
+            ]))
+
+        if not any(ca is not None for ca in children):
+            return None
+
+        # FIXME: merge adjacent
+        return None
+
+    else:
+        raise TypeError(op)
+
+
+def _regex_transform_op(op: Op) -> Op:
+    v = _regex_item_transform_op(op)
+
+    if v is None:
+        return op
+
+    elif isinstance(v, _RegexItem):
+        return Regex(re.compile(v.pat))
+
+    else:
+        raise TypeError(v)
+
+
+##
+
+
+def optimize_op(op: Op) -> Op:
+    op = _regex_transform_op(op)
+
+    return op
+
+
+##
+
+
+def _inline_rules(fn: ta.Callable[[Rule], bool], gram: Grammar) -> Grammar:
+    cur_rule: Rule
+    inlined_rules: dict[str, Op] = {}
+
+    def rec_op(op: Op) -> Op:
+        if isinstance(op, RuleRef):
+            if op.name_f == cur_rule.name_f:
+                return op
+
+            if (r := gram.rule(op.name)) is None or not fn(r):
+                return op
+
+            try:
+                return inlined_rules[r.name]
+            except KeyError:
+                pass
+
+            inlined_rules[op.name] = op
+            i_op = rec_op(r.op)
+            inlined_rules[op.name] = i_op
+
+            return op.coalesce(i_op)
+
+        elif isinstance(op, CompositeOp):
+            return op.replace_children(*map(rec_op, op.children))
+
+        else:
+            return op
+
+    new_rules: list[Rule] = []
+    for rule in gram.rules:
+        cur_rule = rule
+        new_rules.append(rule.replace_op(rec_op(rule.op)))
+
+    return gram.replace_rules(*new_rules)
+
+
+##
+
+
+def optimize_grammar(
+    gram: Grammar,
+    *,
+    inline_channels: ta.Container[Channel] | None = (Channel.SPACE,),
+) -> Grammar:
+    if inline_channels:
+        gram = _inline_rules(lambda r: r.channel in inline_channels, gram)
+
+    gram = gram.replace_rules(*[
+        r.replace_op(optimize_op(r.op))
+        for r in gram.rules
+    ])
+
+    return gram
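Illustrative sketch (not part of the diff): the Repeat branch above lowers ABNF repetition bounds onto regex quantifiers. A self-contained example of that mapping using only the standard re module; the helper below is illustrative, not the package's API.

    import re

    def quantifier(min_: int, max_: int | None) -> str:
        # Mirrors the Times -> quantifier mapping in _regex_item_transform_op above.
        if min_ == 0 and max_ is None:
            return '*'
        elif min_ == 1 and max_ is None:
            return '+'
        elif min_ == 0 and max_ == 1:
            return '?'
        elif max_ is None:
            return f'{{{min_},}}'
        elif min_ == max_:
            return f'{{{min_}}}'
        else:
            return f'{{{min_},{max_}}}'

    # An ABNF 1*3DIGIT (a 0-9 range repeated 1 to 3 times) would lower to '[0-9]{1,3}'.
    pat = re.compile('[0-9]' + quantifier(1, 3))
    assert pat.fullmatch('123') and not pat.fullmatch('1234')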
omextra/text/abnf/parsing.py
CHANGED
@@ -2,10 +2,12 @@ import typing as ta

 from omlish import check

-from .base import Grammar
-from .base import Match
 from .base import Op
+from .grammars import Grammar
+from .grammars import Rule
 from .internal import Regex
+from .matches import Match
+from .matches import longest_match
 from .ops import CaseInsensitiveStringLiteral
 from .ops import Concat
 from .ops import Either
@@ -19,15 +21,23 @@ from .ops import StringLiteral


 class _Parser:
+    class MaxStepsExceededError(Exception):
+        pass
+
     def __init__(
         self,
         grammar: Grammar,
         source: str,
+        *,
+        max_steps: int | None = None,
     ) -> None:
         super().__init__()

         self._grammar = grammar
         self._source = source
+        self._max_steps = max_steps
+
+        self._rules = self._grammar._rules  # noqa

         self._dispatch: dict[type[Op], ta.Any] = {
             StringLiteral: self._iter_parse_string_literal,
@@ -40,6 +50,10 @@ class _Parser:
             Regex: self._iter_parse_regex,
         }

+        self._memo: dict[tuple[Op, int], tuple[Match, ...]] = {}
+
+        self._cur_step = 0
+
     def _iter_parse_string_literal(self, op: StringLiteral, start: int) -> ta.Iterator[Match]:
         if start < len(self._source):  # noqa
             source = self._source[start : start + len(op._value)]  # noqa
@@ -57,67 +71,120 @@ class _Parser:
             source = self._source[start]  # noqa
         except IndexError:
             return
+
         # ranges are always case-sensitive
         if (value := op._value).lo <= source <= value.hi:  # noqa
             yield Match(op, start, start + 1, ())

     def _iter_parse_concat(self, op: Concat, start: int) -> ta.Iterator[Match]:
-        i = 0
         match_tups: list[tuple[Match, ...]] = [()]
+
+        i = 0
         for cp in op._children:  # noqa
             next_match_tups: list[tuple[Match, ...]] = []
+
             for mt in match_tups:
                 for cm in self.iter_parse(cp, mt[-1].end if mt else start):
                     next_match_tups.append((*mt, cm))
                     i += 1
+
             if not next_match_tups:
                 return
+
             match_tups = next_match_tups
+
         if not i:
             return
+
         for mt in sorted(match_tups, key=len, reverse=True):
             yield Match(op, start, mt[-1].end if mt else start, mt)

     def _iter_parse_repeat(self, op: Repeat, start: int) -> ta.Iterator[Match]:
-
-
+        # Map from (repetition_count, end_position) to longest match tuple
+        matches_by_count_pos: dict[tuple[int, int], tuple[Match, ...]] = {(0, start): ()}
+        max_end_by_count: dict[int, int] = {0: start}
+
         i = 0
         while True:
             if op._times.max is not None and i == op._times.max:  # noqa
                 break
-
-
-
-
-
+
+            if self._max_steps is not None and self._cur_step > self._max_steps:
+                raise _Parser.MaxStepsExceededError(self._cur_step)
+            self._cur_step += 1
+
+            next_matches: dict[tuple[int, int], tuple[Match, ...]] = {}
+            next_max_end = max_end_by_count.get(i, -1)
+
+            for (count, end_pos), mt in matches_by_count_pos.items():
+                if count != i:
+                    continue
+
+                for cm in self.iter_parse(op._child, end_pos):  # noqa
+                    next_mt = (*mt, cm)
+                    next_key = (i + 1, cm.end)
+
+                    # Keep only the longest match tuple for each (count, position)
+                    if next_key not in next_matches or len(next_mt) > len(next_matches[next_key]):
+                        next_matches[next_key] = next_mt
+                    if cm.end > next_max_end:
+                        next_max_end = cm.end
+
+            if not next_matches:
+                break
+
+            # Check if we made progress (reached new positions)
+            if next_max_end <= max_end_by_count.get(i, -1):
                 break
+
             i += 1
-
-
+            matches_by_count_pos.update(next_matches)
+            max_end_by_count[i] = next_max_end
+
         if i < op._times.min:  # noqa
             return
-
-
+
+        # Collect valid matches and sort by (end_position, repetition_count) descending
+        valid_matches: list[tuple[int, int, tuple[Match, ...]]] = []
+        for (count, end_pos), mt in matches_by_count_pos.items():
+            if op._times.min <= count <= (op._times.max if op._times.max is not None else i):  # noqa
+                valid_matches.append((end_pos, count, mt))
+
+        for end_pos, _, mt in sorted(valid_matches, key=lambda x: (x[0], x[1]), reverse=True):
+            yield Match(op, start, end_pos, mt)

     def _iter_parse_either(self, op: Either, start: int) -> ta.Iterator[Match]:
         for cp in op._children:  # noqa
             found = False
+
             for cm in self.iter_parse(cp, start):
                 found = True
                 yield Match(op, start, cm.end, (cm,))
+
             if found and op._first_match:  # noqa
                 return

     def _iter_parse_rule_ref(self, op: RuleRef, start: int) -> ta.Iterator[Match]:
-        cp = self.
+        cp = self._rules._rules_by_name_f[op._name_f].op  # noqa
         for cm in self.iter_parse(cp, start):
             yield Match(op, cm.start, cm.end, (cm,))

     def _iter_parse_regex(self, op: Regex, start: int) -> ta.Iterator[Match]:
-
+        if (m := op._pat.match(self._source, start)) is not None:  # noqa
+            yield Match(op, start, m.end(), ())

     def iter_parse(self, op: Op, start: int) -> ta.Iterator[Match]:
-
+        if (key := (op, start)) in self._memo:
+            yield from self._memo[key]
+            return
+
+        if self._max_steps is not None and self._cur_step >= self._max_steps:
+            raise _Parser.MaxStepsExceededError(self._cur_step)
+        self._cur_step += 1
+
+        matches = tuple(self._dispatch[op.__class__](op, start))
+        self._memo[key] = matches
+        yield from matches


 ##
@@ -131,8 +198,9 @@ class _DebugParser(_Parser):
         level: int = 1,
         *,
         write: ta.Callable[[str], None] | None = None,
+        **kwargs: ta.Any,
     ) -> None:
-        super().__init__(grammar, source)
+        super().__init__(grammar, source, **kwargs)

         self._level = level
         if write is None:
@@ -162,7 +230,7 @@ class _DebugParser(_Parser):
             ps = check.isinstance(op, RuleRef).name
         else:
             ps = self._op_str(op)
-        body = f'{start}:{self._source[start]!r} {ps}'
+        body = f'{start}:{self._source[start] if start < len(self._source) else ""!r} {ps}'

         if self._level > 2:
             self._write(f'{ws}+ {body}')
@@ -185,7 +253,7 @@ class _DebugParser(_Parser):
             self._write(f'{ws}- {body}')


-
+#


 def _iter_parse(
@@ -195,18 +263,64 @@ def _iter_parse(
     start: int,
     *,
     debug: int = 0,
+    max_steps: int | None = None,
 ) -> ta.Iterator[Match]:
     parser: _Parser
     if debug:
         parser = _DebugParser(
             grammar,
             source,
+            max_steps=max_steps,
             level=debug,
         )
     else:
         parser = _Parser(
             grammar,
             source,
+            max_steps=max_steps,
         )

     return parser.iter_parse(op, start)
+
+
+##
+
+
+def iter_parse(
+    obj: Grammar | Rule | Op,
+    src: str,
+    *,
+    root: str | None = None,
+    start: int = 0,
+) -> ta.Iterator[Match]:
+    if isinstance(obj, Grammar):
+        gram = obj
+    elif isinstance(obj, Rule):
+        check.none(root)
+        gram = Grammar(obj, root=obj)
+    elif isinstance(obj, Op):
+        check.none(root)
+        gram = Grammar(Rule('root', obj), root='root')
+    else:
+        raise TypeError(obj)
+
+    return gram.iter_parse(
+        src,
+        root,
+        start=start,
+    )
+
+
+def parse(
+    obj: Grammar | Rule | Op,
+    src: str,
+    *,
+    root: str | None = None,
+    start: int = 0,
+) -> Match | None:
+    return longest_match(iter_parse(
+        obj,
+        src,
+        root=root,
+        start=start,
+    ))
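Illustrative sketch (not part of the diff): the new module-level entry points accept a Grammar, a Rule, or a bare Op. A hedged usage example, assuming repeat() with no bounds defaults to zero-or-more and that StringLiteral is constructed from its text; untested against the released wheel.

    from omextra.text.abnf.ops import StringLiteral, repeat
    from omextra.text.abnf.parsing import parse

    op = repeat(StringLiteral('ab'))   # assumed: unbounded repeat of the literal "ab"
    m = parse(op, 'ababab')            # a bare Op is wrapped in a single-rule grammar internally
    if m is not None:
        print(m.start, m.end)          # the longest match should span the whole input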