PyPI - hyperbase - Versions diffs - 0.8.0__py3-none-any.whl - Mend

hyperbase 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

hyperbase/__init__.py +6 -0
hyperbase/constants.py +4 -0
hyperbase/hyperedge.py +1127 -0
hyperbase/parsers/__init__.py +39 -0
hyperbase/parsers/correctness.py +265 -0
hyperbase/parsers/parser.py +41 -0
hyperbase/parsers/utils.py +19 -0
hyperbase/patterns/__init__.py +29 -0
hyperbase/patterns/argroles.py +142 -0
hyperbase/patterns/atoms.py +98 -0
hyperbase/patterns/common.py +172 -0
hyperbase/patterns/counter.py +153 -0
hyperbase/patterns/entrypoints.py +87 -0
hyperbase/patterns/matcher.py +245 -0
hyperbase/patterns/merge.py +52 -0
hyperbase/patterns/properties.py +59 -0
hyperbase/patterns/utils.py +118 -0
hyperbase/patterns/variables.py +161 -0
hyperbase-0.8.0.dist-info/METADATA +64 -0
hyperbase-0.8.0.dist-info/RECORD +23 -0
hyperbase-0.8.0.dist-info/WHEEL +4 -0
hyperbase-0.8.0.dist-info/licenses/AUTHORS +5 -0
hyperbase-0.8.0.dist-info/licenses/LICENSE +21 -0

hyperbase/parsers/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+from importlib.metadata import entry_points, EntryPoint
+from typing import Any
+from hyperbase.parsers.parser import Parser
+def list_parsers() -> dict[str, EntryPoint]:
+    """Return all installed parser plugins.
+    Each plugin registers via the ``hyperbase.parsers`` entry-point group
+    in its ``pyproject.toml``::
+        [project.entry-points."hyperbase.parsers"]
+        alphabeta = "hyperparser_alphabeta:ParserAlphaBeta"
+    """
+    eps = entry_points(group="hyperbase.parsers")
+    return {ep.name: ep for ep in eps}
def get_parser(name: str, **kwargs: Any) -> Parser:
    """Build the parser plugin registered under *name*.

    The name is resolved through :func:`list_parsers`; the matching entry
    point is loaded and the resulting object is called with ``**kwargs``.

    Args:
        name: Entry-point name of the parser plugin.
        **kwargs: Forwarded unchanged to the plugin's constructor.

    Returns:
        Whatever the loaded entry point produces when called; plugins are
        expected to register a :class:`Parser` subclass.

    Raises:
        ValueError: If no plugin named *name* is installed. The message
            lists the names that are available.
    """
    installed = list_parsers()
    entry = installed.get(name)
    if entry is None:
        available = ", ".join(sorted(installed)) or "(none)"
        message = (
            f"Parser {name!r} is not installed. "
            f"Available parsers: {available}"
        )
        raise ValueError(message)
    parser_cls = entry.load()
    return parser_cls(**kwargs)  # type: ignore[no-any-return]
+__all__ = ["Parser", "get_parser", "list_parsers"]

hyperbase/parsers/correctness.py ADDED Viewed

@@ -0,0 +1,265 @@
+from __future__ import annotations
+from collections import Counter
+from typing import Any
+from hyperbase.hyperedge import Hyperedge
+from hyperbase.parsers.utils import filter_alphanumeric_strings
def check_structural_quality(edge: Hyperedge) -> dict[Hyperedge, list[tuple[str, str, int]]]:
    """Walk *edge* and collect structural-quality problems per sub-edge.

    Every non-atomic (sub-)edge is inspected, parents before children:

    * each argument role must be one of ``mspaoixtjrc``
      (``bad-argrole``, severity 2);
    * each role in ``spoiamc`` may occur at most once
      (``duplicate-argrole-<role>``, severity 2);
    * when the first element's ``mt`` is ``'J'``, the set of ``mt`` values
      of the remaining elements must be exactly ``{'R'}``, ``{'C'}`` or
      ``{'R', 'S'}`` (``bad-junction-types``, severity 3).

    Both checks are best-effort: an exception raised inside a check ends
    that check for the current edge, keeping anything already recorded.

    Returns:
        A dict mapping each offending sub-edge to its list of
        ``(code, message, severity)`` tuples. Edges without findings are
        absent; a falsy *edge* yields an empty dict.
    """
    valid_roles = 'mspaoixtjrc'
    unique_roles = 'spoiamc'
    accepted_junction_types = ({'R'}, {'C'}, {'R', 'S'})
    report: dict[Hyperedge, list[tuple[str, str, int]]] = {}

    def _argrole_issues(node: Hyperedge, found: list[tuple[str, str, int]]) -> None:
        try:
            roles = node.argroles()
            seen: Counter[str] = Counter()
            for role in roles:
                if role not in valid_roles:
                    found.append((
                        'bad-argrole',
                        f"Bad argument role '{role}'. Should be one of 'mspaoixtjrc'.",
                        2,
                    ))
                seen[role] += 1
            for role in unique_roles:
                if seen[role] > 1:
                    found.append((
                        f'duplicate-argrole-{role}',
                        f"Argument role '{role}' should only be used once.",
                        2,
                    ))
        except Exception:
            # Best-effort: an edge that cannot report argroles is skipped.
            pass

    def _junction_issues(node: Hyperedge, found: list[tuple[str, str, int]]) -> None:
        try:
            if node[0].mt == 'J':
                argument_types = {child.mt for child in node[1:]}
                if argument_types not in accepted_junction_types:
                    found.append((
                        'bad-junction-types',
                        "Junction arguments should ideally be all of type 'R[S]' or all of type 'C'.",
                        3,
                    ))
        except Exception:
            # Best-effort: edges whose types cannot be read are skipped.
            pass

    def _walk(node: Hyperedge) -> None:
        if not node or node.atom:
            return
        found: list[tuple[str, str, int]] = []
        _argrole_issues(node, found)
        _junction_issues(node, found)
        if found:
            report[node] = found
        for child in node:
            _walk(child)

    if edge:
        _walk(edge)
    return report
def _match_token_to_root_run(
    token_idx: int,
    tokens: list[str],
    roots: list[str],
    matched_tokens: set[int],
    matched_roots: set[int],
) -> None:
    """Case (b): match one token against a run of consecutive unmatched roots."""
    token = tokens[token_idx]
    for start in range(len(roots)):
        if start in matched_roots:
            continue
        joined = ""
        run: list[int] = []
        for root_idx in range(start, len(roots)):
            if root_idx in matched_roots:
                # A run may not step over an already consumed root.
                break
            joined += roots[root_idx]
            run.append(root_idx)
            if joined == token:
                matched_tokens.add(token_idx)
                matched_roots.update(run)
                return
            if len(joined) >= len(token):
                # Same length or longer without being equal: dead end.
                break


def _match_root_to_token_run(
    token_idx: int,
    tokens: list[str],
    roots: list[str],
    matched_tokens: set[int],
    matched_roots: set[int],
) -> None:
    """Case (c): match one root against this token plus following tokens."""
    for root_idx, root in enumerate(roots):
        if root_idx in matched_roots:
            continue
        joined = ""
        run: list[int] = []
        for next_idx in range(token_idx, len(tokens)):
            if next_idx in matched_tokens:
                # Already consumed tokens are skipped, not treated as a
                # boundary, so a run may be non-contiguous.
                continue
            joined += tokens[next_idx]
            run.append(next_idx)
            if joined == root:
                matched_roots.add(root_idx)
                matched_tokens.update(run)
                break
            if len(joined) >= len(root):
                break
        # NOTE(review): unlike the other strategies, the scan keeps going
        # over the remaining roots after a match, so later tokens can be
        # consumed here as well. Kept as-is to preserve results; confirm
        # whether an early exit was intended.


def _match_aligned_runs(
    token_idx: int,
    tokens: list[str],
    roots: list[str],
    matched_tokens: set[int],
    matched_roots: set[int],
) -> None:
    """Case (d), positional: pair k tokens with k consecutive roots."""
    max_joined_len = 10  # give up once either concatenation exceeds this
    for root_start in range(len(roots)):
        if root_start in matched_roots:
            continue
        tokens_joined = ""
        roots_joined = ""
        token_run: list[int] = []
        root_run: list[int] = []
        span = min(len(tokens) - token_idx, len(roots) - root_start)
        for offset in range(span):
            cur_token = token_idx + offset
            cur_root = root_start + offset
            if cur_token in matched_tokens or cur_root in matched_roots:
                break
            tokens_joined += tokens[cur_token]
            roots_joined += roots[cur_root]
            token_run.append(cur_token)
            root_run.append(cur_root)
            if tokens_joined == roots_joined and tokens_joined:
                matched_tokens.update(token_run)
                matched_roots.update(root_run)
                return
            if len(tokens_joined) > max_joined_len or len(roots_joined) > max_joined_len:
                break


def _match_contraction(
    token_idx: int,
    tokens: list[str],
    roots: list[str],
    matched_tokens: set[int],
    matched_roots: set[int],
) -> None:
    """Case (d), non-positional: this token + the next one == any two roots."""
    next_idx = token_idx + 1
    if next_idx >= len(tokens) or next_idx in matched_tokens:
        return
    target = tokens[token_idx] + tokens[next_idx]
    for first in range(len(roots)):
        if first in matched_roots:
            continue
        for second in range(len(roots)):
            if second in matched_roots or second == first:
                continue
            if roots[first] + roots[second] == target:
                matched_tokens.update((token_idx, next_idx))
                matched_roots.update((first, second))
                return


def _match_tokens_to_roots(
    tokens: list[str],
    roots: list[str],
) -> tuple[set[int], set[int]]:
    """Greedily pair token indices with root indices.

    Tokens are processed in order. Each token first looks for an equal,
    unmatched root; failing that, the fallback strategies are tried in
    order until one of them consumes the token.

    Returns:
        ``(matched_token_indices, matched_root_indices)``.
    """
    matched_tokens: set[int] = set()
    matched_roots: set[int] = set()
    fallbacks = (
        _match_token_to_root_run,
        _match_root_to_token_run,
        _match_aligned_runs,
        _match_contraction,
    )
    for token_idx, token in enumerate(tokens):
        if token_idx in matched_tokens:
            continue  # consumed earlier by a multi-token match
        exact = next(
            (idx for idx, root in enumerate(roots)
             if root == token and idx not in matched_roots),
            None,
        )
        if exact is not None:
            matched_tokens.add(token_idx)
            matched_roots.add(exact)
            continue
        for attempt in fallbacks:
            attempt(token_idx, tokens, roots, matched_tokens, matched_roots)
            if token_idx in matched_tokens:
                break
    return matched_tokens, matched_roots


def _token_matching_errors(
    edge: Hyperedge,
    tokens: list[str],
) -> list[tuple[str, str, int]]:
    """Report atom roots and source tokens that could not be paired up."""
    clean_tokens = filter_alphanumeric_strings(tokens)
    roots = filter_alphanumeric_strings(
        [atom.label() for atom in edge.all_atoms()]
    )
    matched_tokens, matched_roots = _match_tokens_to_roots(clean_tokens, roots)

    issues: list[tuple[str, str, int]] = [
        ('root-without-token',
         f"Atom root '{root}' is used more times than it appears in the original text.",
         1)
        for idx, root in enumerate(roots)
        if idx not in matched_roots
    ]
    issues.extend(
        ('token-unused',
         f"Atom root '{token}' is not used, but it appears in the original text.",
         1)
        for idx, token in enumerate(clean_tokens)
        if idx not in matched_tokens
    )
    return issues


def badness_check(
    edge: Hyperedge,
    tokens: list[str]
) -> dict[Any, list[tuple[str, str, int]]]:
    """Collect correctness, structural and token-coverage problems for *edge*.

    Three sources are merged into one report:

    1. ``edge.check_correctness()`` -- each ``(type, message)`` pair is
       reported with severity ``0``;
    2. :func:`check_structural_quality` -- entries are appended under the
       same key when it already exists;
    3. token matching -- the normalized *tokens* are paired with the
       normalized atom labels of *edge*; leftovers on either side are
       reported with severity ``1`` under the key ``'token-matching'``.
       This step only runs for a truthy *edge* and is best-effort.

    Args:
        edge: The parsed hyperedge to assess.
        tokens: Tokens of the original text; the list is not modified.

    Returns:
        A dict mapping a key (whatever the two checkers use, plus the
        string ``'token-matching'``) to ``(code, message, severity)``
        tuples.
    """
    errors: dict[Any, list[tuple[str, str, int]]] = {
        key: [(err_type, err_msg, 0) for err_type, err_msg in issues]
        for key, issues in edge.check_correctness().items()
    }
    for key, issues in check_structural_quality(edge).items():
        errors.setdefault(key, []).extend(issues)

    if edge:
        try:
            token_errors = _token_matching_errors(edge, tokens)
        except Exception:
            # Best-effort by design: if token accounting fails (e.g. the
            # edge is invalid) the other findings are still returned.
            token_errors = []
        if token_errors:
            errors['token-matching'] = token_errors
    return errors

hyperbase/parsers/parser.py ADDED Viewed

@@ -0,0 +1,41 @@
+from __future__ import annotations
+from collections.abc import Iterator
+from typing import Any
class Parser:
    """Base class for parser plugins.

    Subclasses must implement :meth:`sentensize` and
    :meth:`parse_sentence`; the other methods are built on those two.
    """

    def sentensize(self, text: str) -> list[str]:
        """Split *text* into sentences. Must be overridden."""
        raise NotImplementedError

    def parse(self, text: str) -> Iterator[dict[str, Any]]:
        """Lazily yield every parse of every sentence found in *text*."""
        for sentence in self.sentensize(text):
            yield from self.parse_sentence(sentence)

    def parse_sentence(self, sentence: str) -> list[dict[str, Any]]:
        """Return the parse results for one sentence. Must be overridden."""
        raise NotImplementedError

    def parse_batch(self, sentences: list[str]) -> list[list[dict[str, Any]]]:
        """Parse multiple sentences, one result list per input sentence.

        Subclasses may override with a true batched implementation
        (e.g. a single CT2 call).
        """
        return [self.parse_sentence(sentence) for sentence in sentences]

    def parse_text(
        self, text: str, batch_size: int = 8, progress: bool = False
    ) -> list[dict[str, Any]]:
        """Sentensize *text*, then parse all sentences in batches.

        Sentences with fewer than two whitespace-separated words are
        skipped before batching.

        Args:
            text: Raw text to split and parse.
            batch_size: Number of sentences per :meth:`parse_batch` call;
                must be at least 1.
            progress: Show a ``tqdm`` progress bar over the batches.
                ``tqdm`` is imported only when this is true.

        Returns:
            A flat list of parse results across all sentences.

        Raises:
            ValueError: If *batch_size* is smaller than 1. A negative value
                would otherwise produce an empty batch range and silently
                drop all input.
        """
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )
        sentences = [s for s in self.sentensize(text) if len(s.split()) > 1]
        batch_range = range(0, len(sentences), batch_size)
        if progress:
            from tqdm import tqdm  # type: ignore[import-untyped]
            batch_range = tqdm(batch_range, desc="Parsing batches", leave=False)
        results: list[dict[str, Any]] = []
        for i in batch_range:
            batch = sentences[i:i + batch_size]
            for sentence_results in self.parse_batch(batch):
                results.extend(sentence_results)
        return results

hyperbase/parsers/utils.py ADDED Viewed

@@ -0,0 +1,19 @@
def filter_alphanumeric_strings(strings: list[str]) -> list[str]:
    """Reduce each string to its lowercased alphanumeric characters.

    Characters for which ``str.isalnum()`` is false are removed and the
    remaining characters are lowercased one by one. Strings that end up
    empty are left out of the result.

    Args:
        strings: List of strings to filter.

    Returns:
        The cleaned, non-empty strings in their original order.
    """
    cleaned = (
        ''.join(ch.lower() for ch in text if ch.isalnum())
        for text in strings
    )
    return [text for text in cleaned if text]

hyperbase/patterns/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+from hyperbase.patterns.common import common_pattern
+from hyperbase.patterns.entrypoints import match_pattern, edge_matches_pattern
+from hyperbase.patterns.merge import merge_patterns
+from hyperbase.patterns.properties import (is_wildcard, is_pattern, is_full_pattern, is_fun_pattern,
+                                            is_unordered_pattern)
+from hyperbase.patterns.utils import more_general
+from hyperbase.patterns.variables import (all_variables, apply_vars, apply_variables, extract_vars_map, is_variable,
+                                           contains_variable, remove_variables)
+__all__ = [
+    'all_variables',
+    'apply_vars',
+    'apply_variables',
+    'common_pattern',
+    'contains_variable',
+    'edge_matches_pattern',
+    'extract_vars_map',
+    'is_full_pattern',
+    'is_fun_pattern',
+    'is_pattern',
+    'is_unordered_pattern',
+    'is_variable',
+    'is_wildcard',
+    'match_pattern',
+    'merge_patterns',
+    'more_general',
+    'remove_variables'
+]

hyperbase/patterns/argroles.py ADDED Viewed

@@ -0,0 +1,142 @@
+from __future__ import annotations
+import itertools
+from collections.abc import Iterator, Mapping, Sequence
+from typing import TYPE_CHECKING
+from hyperbase.hyperedge import Hyperedge, hedge
+from hyperbase.patterns.utils import _defun_pattern_argroles
+if TYPE_CHECKING:
+    from hyperbase.patterns.matcher import Matcher
+def _match_by_argroles(
+        matcher: Matcher,
+        edge: Hyperedge,
+        pattern: Hyperedge,
+        role_counts: list[tuple[str, int]],
+        min_vars: int,
+        matched: tuple[Hyperedge, ...] = (),
+        curvars: dict[str, Hyperedge] | None = None,
+        tok_pos: list[int] | None = None
+) -> list[dict[str, Hyperedge]]:
+    if curvars is None:
+        curvars = {}
+    if len(role_counts) == 0:
+        return [curvars]
+    argrole, n = role_counts[0]
+    # match connector
+    if argrole == 'X':
+        eitems = [edge[0]]
+        pitems = [pattern[0]]
+    # match any argrole
+    elif argrole == '*':
+        eitems = [e for e in edge if e not in matched]
+        pitems = list(pattern[-n:])
+    # match specific argrole
+    else:
+        eitems = edge.edges_with_argrole(argrole)
+        pitems = _defun_pattern_argroles(pattern).edges_with_argrole(argrole)
+    if len(eitems) < n:
+        if len(curvars) >= min_vars:
+            return [curvars]
+        else:
+            return []
+    result: list[dict[str, Hyperedge]] = []
+    if tok_pos:
+        tok_pos_items = [tok_pos[i] for i, subedge in enumerate(edge) if subedge in eitems]
+        tok_pos_perms = tuple(itertools.permutations(tok_pos_items, r=n))
+    for perm_n, perm in enumerate(tuple(itertools.permutations(eitems, r=n))):
+        if tok_pos:
+            tok_pos_perm = tok_pos_perms[perm_n]
+        perm_result: list[dict[str, Hyperedge]] = [{}]
+        for i, eitem in enumerate(perm):
+            pitem = pitems[i]
+            tok_pos_item = tok_pos_perm[i] if tok_pos else None
+            item_result: list[dict[str, Hyperedge]] = []
+            for variables in perm_result:
+                item_result += matcher.match(
+                    eitem,
+                    pitem,
+                    {**curvars, **variables},
+                    tok_pos=tok_pos_item
+                )
+            perm_result = item_result
+            if len(item_result) == 0:
+                break
+        for variables in perm_result:
+            result += _match_by_argroles(
+                matcher,
+                edge,
+                pattern,
+                role_counts[1:],
+                min_vars,
+                matched + perm,
+                {**curvars, **variables},
+                tok_pos=tok_pos
+            )
+    return result
def edge2rolemap(edge: Hyperedge) -> dict[str, list[Hyperedge]]:
    """Group the arguments of *edge* by argument role.

    The role string comes from ``edge[0].argroles()``. Surrounding braces
    (``{...}``) are stripped, then the roles are paired positionally with
    ``edge[1:]``.

    Args:
        edge: Edge whose first element exposes ``argroles()``.

    Returns:
        A dict mapping each role character to the arguments carrying it,
        in their original order. Empty when there are no argument roles.
    """
    argroles = edge[0].argroles()
    # Slice instead of indexing: an empty role string must not raise
    # IndexError, it simply yields an empty role map.
    if argroles[:1] == '{':
        argroles = argroles[1:-1]
    rolemap: dict[str, list[Hyperedge]] = {}
    for role, subedge in zip(argroles, edge[1:]):
        rolemap.setdefault(role, []).append(subedge)
    return rolemap
def rolemap2edge(pred: Hyperedge, rm: Mapping[str, Sequence[Hyperedge]]) -> Hyperedge:
    """Rebuild an edge from a predicate and a role map.

    The arguments are laid out role by role, in the iteration order of
    *rm*; the matching argument-role string is then applied to the new
    edge via ``replace_argroles``.

    Args:
        pred: Edge placed in first position of the result.
        rm: Mapping from role to the arguments carrying that role.

    Returns:
        The edge built by ``hedge`` with its argument roles replaced.
    """
    role_arg_pairs = [(role, arg) for role in list(rm.keys()) for arg in rm[role]]
    argroles = ''.join(role for role, _ in role_arg_pairs)
    subedges: list[Hyperedge] = [pred, *(arg for _, arg in role_arg_pairs)]
    result = hedge(subedges)
    assert result is not None
    return result.replace_argroles(argroles)
def rolemap_pairings(
        rm1: dict[str, list[Hyperedge]],
        rm2: dict[str, list[Hyperedge]]
) -> Iterator[tuple[dict[str, tuple[Hyperedge, ...]], dict[str, tuple[Hyperedge, ...]]]]:
    """Enumerate ways of pairing up the arguments of two role maps.

    Only roles present in both maps take part. For each such role,
    ``n = min(len(rm1[role]), len(rm2[role]))`` arguments are taken from
    each side: every ordered selection of ``n`` arguments from *rm1* is
    combined with every order-preserving selection of ``n`` arguments
    from *rm2*. The per-role choices are then combined across roles.

    Roles are processed in the insertion order of *rm1*, so the order of
    the yielded pairings and of the keys in the yielded dicts does not
    depend on string hash randomization.

    Args:
        rm1: First role map (role -> arguments).
        rm2: Second role map (role -> arguments).

    Yields:
        ``(reduced_rm1, reduced_rm2)`` pairs whose values are equal-length
        tuples per shared role. With no shared role a single pair of
        empty dicts is yielded.
    """
    roles = [role for role in rm1 if role in rm2]

    per_role_pairings: list[list[tuple[tuple[Hyperedge, ...], tuple[Hyperedge, ...]]]] = []
    for role in roles:
        n = min(len(rm1[role]), len(rm2[role]))
        role_pairings: list[tuple[tuple[Hyperedge, ...], tuple[Hyperedge, ...]]] = []
        for selection in itertools.combinations(rm1[role], n):
            for args1 in itertools.permutations(selection):
                for args2 in itertools.combinations(rm2[role], n):
                    role_pairings.append((args1, args2))
        per_role_pairings.append(role_pairings)

    for pairing in itertools.product(*per_role_pairings):
        rm1_: dict[str, tuple[Hyperedge, ...]] = {}
        rm2_: dict[str, tuple[Hyperedge, ...]] = {}
        for role, (args1, args2) in zip(roles, pairing):
            rm1_[role] = args1
            rm2_[role] = args2
        yield rm1_, rm2_

hyperbase/patterns/atoms.py ADDED Viewed

@@ -0,0 +1,98 @@
+from hyperbase.hyperedge import Hyperedge
+def _matches_atomic_pattern(edge: Hyperedge, atomic_pattern: Hyperedge) -> bool:
+    ap_parts = atomic_pattern.parts()  # type: ignore[attr-defined]
+    if len(ap_parts) == 0 or len(ap_parts[0]) == 0:
+        return False
+    # structural match
+    struct_code = ap_parts[0][0]
+    if struct_code == '.':
+        if edge.not_atom:
+            return False
+    elif atomic_pattern.parens:  # type: ignore[attr-defined]
+        if edge.atom:
+            return False
+    elif struct_code != '*' and not struct_code.isupper():
+        if edge.not_atom:
+            return False
+        if edge.root() != atomic_pattern.root():  # type: ignore[attr-defined]
+            return False
+    # role match
+    if len(ap_parts) > 1:
+        pos = 1
+        # type match
+        ap_role = atomic_pattern.role()  # type: ignore[attr-defined]
+        ap_type = ap_role[0]
+        e_type = edge.type()
+        n = len(ap_type)
+        if len(e_type) < n or e_type[:n] != ap_type:
+            return False
+        e_atom = edge.inner_atom()
+        if len(ap_role) > 1:
+            e_role = e_atom.role()
+            # check if edge role has enough parts to satisfy the wildcard
+            # specification
+            if len(e_role) < len(ap_role):
+                return False
+            # argroles match
+            if ap_type[0] in {'B', 'P'}:
+                ap_argroles_parts = ap_role[1].split('-')
+                if len(ap_argroles_parts) == 1:
+                    ap_argroles_parts.append('')
+                ap_negroles = ap_argroles_parts[1]
+                # fixed order?
+                ap_argroles_posopt = ap_argroles_parts[0]
+                e_argroles = e_role[1]
+                if len(ap_argroles_posopt) > 0 and ap_argroles_posopt[0] == '{':
+                    ap_argroles_posopt = ap_argroles_posopt[1:-1]
+                else:
+                    ap_argroles_posopt = ap_argroles_posopt.replace(',', '')
+                    if len(e_argroles) > len(ap_argroles_posopt):
+                        return False
+                    else:
+                        return ap_argroles_posopt.startswith(e_argroles)  # type: ignore[no-any-return]
+                ap_argroles_parts = ap_argroles_posopt.split(',')
+                ap_posroles = ap_argroles_parts[0]
+                ap_argroles = set(ap_posroles) | set(ap_negroles)
+                for argrole in ap_argroles:
+                    min_count = ap_posroles.count(argrole)
+                    # if there are argrole exclusions
+                    fixed = ap_negroles.count(argrole) > 0
+                    count = e_argroles.count(argrole)
+                    if count < min_count:
+                        return False
+                    # deal with exclusions
+                    if fixed and count > min_count:
+                        return False
+                pos = 2
+            # match rest of role
+            while pos < len(ap_role):
+                if e_role[pos] != ap_role[pos]:
+                    return False
+                pos += 1
+    # match rest of atom
+    if len(ap_parts) > 2:
+        e_parts = e_atom.parts()
+        # check if edge role has enough parts to satisfy the wildcard
+        # specification
+        if len(e_parts) < len(ap_parts):
+            return False
+        while pos < len(ap_parts):
+            if e_parts[pos] != ap_parts[pos]:
+                return False
+            pos += 1
+    return True