PyPI - modelwright - Versions diffs - 0.1.0a1__py3-none-any.whl - Mend

modelwright 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

modelwright/__init__.py +148 -0
modelwright/cli.py +466 -0
modelwright/conversion.py +931 -0
modelwright/evaluation.py +173 -0
modelwright/execution.py +239 -0
modelwright/extraction.py +662 -0
modelwright/formulas.py +571 -0
modelwright/formulas_oracle.py +153 -0
modelwright/generation.py +726 -0
modelwright/graph.py +591 -0
modelwright/oracle_validation.py +59 -0
modelwright/oracles.py +132 -0
modelwright/references.py +209 -0
modelwright/validation.py +475 -0
modelwright-0.1.0a1.dist-info/METADATA +160 -0
modelwright-0.1.0a1.dist-info/RECORD +20 -0
modelwright-0.1.0a1.dist-info/WHEEL +5 -0
modelwright-0.1.0a1.dist-info/entry_points.txt +2 -0
modelwright-0.1.0a1.dist-info/licenses/LICENSE +21 -0
modelwright-0.1.0a1.dist-info/top_level.txt +1 -0

modelwright/formulas.py ADDED Viewed

@@ -0,0 +1,571 @@
+"""Formula expression records.
+These records describe translated formula structure; they do not translate
+Excel formula text themselves.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Literal
+from openpyxl.formula.tokenizer import Tokenizer
+from openpyxl.utils.cell import get_column_letter, range_boundaries
+from modelwright.extraction import CellRecord
+from modelwright.graph import DependencyEdge, DependencyGraph
+from modelwright.references import WorkbookReference
+from modelwright.references import normalize_reference
+JsonValue = str | int | float | bool | None | list[Any] | dict[str, Any]
+ExpressionKind = Literal["literal", "reference", "unary", "binary", "comparison", "function_call"]
+DiagnosticSeverity = Literal["info", "warning", "error"]
+FormulaReferenceIndex = dict[tuple[str, str], WorkbookReference]
+SUPPORTED_FUNCTIONS = frozenset(
+    {
+        "AND",
+        "AVERAGE",
+        "CONCATENATE",
+        "COUNTIF",
+        "COUNTIFS",
+        "IF",
+        "IFERROR",
+        "IFNA",
+        "MAX",
+        "MIN",
+        "OR",
+        "OFFSET",
+        "ROUND",
+        "SUM",
+        "SUMIF",
+        "SUMIFS",
+        "VLOOKUP",
+    }
+)
+SUPPORTED_OPERATORS = frozenset({"+", "-", "*", "/", "^", "&", ">", ">=", "<", "<=", "=", "<>", "(", ")", ","})
+@dataclass(frozen=True)
+class FormulaTranslationDiagnostic:
+    """Formula translation concern tied to source formula provenance."""
+    code: str
+    message: str
+    severity: DiagnosticSeverity = "warning"
+    location: str | None = None
+    raw_value: JsonValue = None
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "FormulaTranslationDiagnostic":
+        return cls(
+            code=data["code"],
+            message=data["message"],
+            severity=data.get("severity", "warning"),
+            location=data.get("location"),
+            raw_value=data.get("raw_value"),
+        )
+    def to_dict(self) -> dict[str, JsonValue]:
+        return {
+            "code": self.code,
+            "message": self.message,
+            "severity": self.severity,
+            "location": self.location,
+            "raw_value": self.raw_value,
+        }
+@dataclass(frozen=True)
+class FormulaExpressionNode:
+    """One node in a translated formula expression tree."""
+    kind: ExpressionKind
+    value: JsonValue = None
+    reference: WorkbookReference | None = None
+    operator: str | None = None
+    function_name: str | None = None
+    operands: tuple["FormulaExpressionNode", ...] = field(default_factory=tuple)
+    @classmethod
+    def literal(cls, value: JsonValue) -> "FormulaExpressionNode":
+        return cls(kind="literal", value=value)
+    @classmethod
+    def reference_to(cls, reference: WorkbookReference) -> "FormulaExpressionNode":
+        return cls(kind="reference", reference=reference)
+    @classmethod
+    def unary(
+        cls,
+        operator: str,
+        operand: "FormulaExpressionNode",
+    ) -> "FormulaExpressionNode":
+        return cls(kind="unary", operator=operator, operands=(operand,))
+    @classmethod
+    def binary(
+        cls,
+        operator: str,
+        left: "FormulaExpressionNode",
+        right: "FormulaExpressionNode",
+    ) -> "FormulaExpressionNode":
+        return cls(kind="binary", operator=operator, operands=(left, right))
+    @classmethod
+    def comparison(
+        cls,
+        operator: str,
+        left: "FormulaExpressionNode",
+        right: "FormulaExpressionNode",
+    ) -> "FormulaExpressionNode":
+        return cls(kind="comparison", operator=operator, operands=(left, right))
+    @classmethod
+    def function_call(
+        cls,
+        function_name: str,
+        operands: tuple["FormulaExpressionNode", ...],
+    ) -> "FormulaExpressionNode":
+        return cls(kind="function_call", function_name=function_name.upper(), operands=operands)
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "FormulaExpressionNode":
+        reference_data = data.get("reference")
+        return cls(
+            kind=data["kind"],
+            value=data.get("value"),
+            reference=WorkbookReference.from_dict(reference_data) if reference_data is not None else None,
+            operator=data.get("operator"),
+            function_name=data.get("function_name"),
+            operands=tuple(cls.from_dict(item) for item in data.get("operands", [])),
+        )
+    def to_dict(self) -> dict[str, JsonValue]:
+        return {
+            "kind": self.kind,
+            "value": self.value,
+            "reference": self.reference.to_dict() if self.reference is not None else None,
+            "operator": self.operator,
+            "function_name": self.function_name,
+            "operands": [operand.to_dict() for operand in self.operands],
+        }
+@dataclass(frozen=True)
+class FormulaExpression:
+    """Translated expression for one source formula cell."""
+    source_cell: str
+    raw_formula: str
+    root: FormulaExpressionNode | None = None
+    diagnostics: tuple[FormulaTranslationDiagnostic, ...] = field(default_factory=tuple)
+    @property
+    def translated(self) -> bool:
+        return self.root is not None and not any(diagnostic.severity == "error" for diagnostic in self.diagnostics)
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "FormulaExpression":
+        root_data = data.get("root")
+        return cls(
+            source_cell=data["source_cell"],
+            raw_formula=data["raw_formula"],
+            root=FormulaExpressionNode.from_dict(root_data) if root_data is not None else None,
+            diagnostics=tuple(
+                FormulaTranslationDiagnostic.from_dict(item) for item in data.get("diagnostics", [])
+            ),
+        )
+    def to_dict(self) -> dict[str, JsonValue]:
+        return {
+            "source_cell": self.source_cell,
+            "raw_formula": self.raw_formula,
+            "root": self.root.to_dict() if self.root is not None else None,
+            "translated": self.translated,
+            "diagnostics": [diagnostic.to_dict() for diagnostic in self.diagnostics],
+        }
+def translate_formula_cell(
+    cell: CellRecord,
+    graph: DependencyGraph,
+    reference_index: FormulaReferenceIndex | None = None,
+) -> FormulaExpression:
+    """Translate one supported formula cell into an expression tree."""
+    if cell.formula is None:
+        return FormulaExpression(
+            source_cell=cell.cell_ref,
+            raw_formula=str(cell.raw_value),
+            diagnostics=(
+                FormulaTranslationDiagnostic(
+                    code="not_a_formula_cell",
+                    message="cell does not contain a formula record",
+                    severity="error",
+                    location=cell.cell_ref,
+                    raw_value=cell.raw_value,
+                ),
+            ),
+        )
+    raw_formula = cell.formula.raw_formula
+    try:
+        tokens = _formula_tokens(raw_formula)
+        parser = _FormulaParser(
+            tokens=tokens,
+            cell=cell,
+            graph=graph,
+            reference_index=reference_index,
+        )
+        root = parser.parse()
+    except FormulaTranslationError as error:
+        return FormulaExpression(
+            source_cell=cell.cell_ref,
+            raw_formula=raw_formula,
+            diagnostics=(
+                FormulaTranslationDiagnostic(
+                    code=error.code,
+                    message=error.message,
+                    severity="error",
+                    location=cell.cell_ref,
+                    raw_value=error.raw_value,
+                ),
+            ),
+        )
+    return FormulaExpression(source_cell=cell.cell_ref, raw_formula=raw_formula, root=root)
+def build_formula_reference_index(graph: DependencyGraph) -> FormulaReferenceIndex:
+    """Build fast lookup for formula raw references by target cell."""
+    index: FormulaReferenceIndex = {}
+    for edge in graph.execution_edges:
+        index.setdefault((edge.target.normalized, edge.raw_reference), _expression_reference_from_edge(edge))
+    return index
+def _expression_reference_from_edge(edge: DependencyEdge) -> WorkbookReference:
+    if edge.resolved_from is not None and edge.resolved_from.kind in {"cell", "range"}:
+        return edge.resolved_from
+    return edge.source
+@dataclass(frozen=True)
+class _FormulaToken:
+    kind: str
+    value: str
+class FormulaTranslationError(Exception):
+    def __init__(self, code: str, message: str, raw_value: JsonValue = None) -> None:
+        super().__init__(message)
+        self.code = code
+        self.message = message
+        self.raw_value = raw_value
+class _FormulaParser:
+    def __init__(
+        self,
+        *,
+        tokens: tuple[_FormulaToken, ...],
+        cell: CellRecord,
+        graph: DependencyGraph,
+        reference_index: FormulaReferenceIndex | None,
+    ) -> None:
+        self.tokens = tokens
+        self.cell = cell
+        self.graph = graph
+        self.reference_index = reference_index
+        self.index = 0
+    def parse(self) -> FormulaExpressionNode:
+        expression = self._parse_comparison()
+        if self._peek() is not None:
+            token = self._peek()
+            raise FormulaTranslationError("unexpected_formula_token", "unexpected token after expression", token.value)
+        return expression
+    def _parse_comparison(self) -> FormulaExpressionNode:
+        left = self._parse_concatenation()
+        token = self._peek()
+        if token is not None and token.value in {">", ">=", "<", "<=", "=", "<>"}:
+            self._advance()
+            right = self._parse_concatenation()
+            return FormulaExpressionNode.comparison(token.value, left, right)
+        return left
+    def _parse_concatenation(self) -> FormulaExpressionNode:
+        expression = self._parse_additive()
+        while (token := self._peek()) is not None and token.value == "&":
+            self._advance()
+            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_additive())
+        return expression
+    def _parse_additive(self) -> FormulaExpressionNode:
+        expression = self._parse_multiplicative()
+        while (token := self._peek()) is not None and token.value in {"+", "-"}:
+            self._advance()
+            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_multiplicative())
+        return expression
+    def _parse_multiplicative(self) -> FormulaExpressionNode:
+        expression = self._parse_exponent()
+        while (token := self._peek()) is not None and token.value in {"*", "/"}:
+            self._advance()
+            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_exponent())
+        return expression
+    def _parse_exponent(self) -> FormulaExpressionNode:
+        expression = self._parse_unary()
+        while (token := self._peek()) is not None and token.value == "^":
+            self._advance()
+            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_unary())
+        return expression
+    def _parse_unary(self) -> FormulaExpressionNode:
+        token = self._peek()
+        if token is not None and token.kind == "operator" and token.value in {"+", "-"}:
+            self._advance()
+            operand = self._parse_unary()
+            if token.value == "+":
+                return operand
+            return FormulaExpressionNode.unary(token.value, operand)
+        return self._parse_primary()
+    def _parse_primary(self) -> FormulaExpressionNode:
+        token = self._advance()
+        if token.kind == "number":
+            return FormulaExpressionNode.literal(_number_value(token.value))
+        if token.kind == "text":
+            return FormulaExpressionNode.literal(token.value)
+        if token.kind == "logical":
+            return FormulaExpressionNode.literal(token.value == "TRUE")
+        if token.kind == "reference":
+            return FormulaExpressionNode.reference_to(self._resolved_reference(token.value))
+        if token.kind == "identifier":
+            return self._parse_function_call(token.value)
+        if token.value == "(":
+            expression = self._parse_comparison()
+            self._expect(")")
+            return expression
+        raise FormulaTranslationError("unsupported_formula_token", "unsupported formula token", token.value)
+    def _parse_function_call(self, function_name: str) -> FormulaExpressionNode:
+        self._expect("(")
+        raw_function_name = function_name.upper()
+        function_name = _normalized_function_name(raw_function_name)
+        if function_name not in SUPPORTED_FUNCTIONS:
+            raise FormulaTranslationError(
+                "unsupported_function",
+                f"function {raw_function_name} is not supported",
+                raw_function_name,
+            )
+        arguments: list[FormulaExpressionNode] = []
+        if (token := self._peek()) is not None and token.value == ")":
+            self._advance()
+            return FormulaExpressionNode.function_call(function_name, tuple(arguments))
+        while True:
+            arguments.append(self._parse_comparison())
+            token = self._peek()
+            if token is not None and token.value == ",":
+                self._advance()
+                continue
+            self._expect(")")
+            if function_name == "OFFSET":
+                return _static_offset_reference(arguments)
+            return FormulaExpressionNode.function_call(function_name, tuple(arguments))
+    def _resolved_reference(self, raw_reference: str) -> WorkbookReference:
+        semantic_reference = normalize_reference(raw_reference, current_sheet=_sheet_name(self.cell.cell_ref))
+        if semantic_reference.kind == "range":
+            return semantic_reference
+        if self.reference_index is not None:
+            source = self.reference_index.get((self.cell.cell_ref, raw_reference))
+            if source is not None:
+                if source.kind == "structured":
+                    raise FormulaTranslationError(
+                        "unsupported_structured_reference",
+                        "structured references are not supported",
+                        raw_reference,
+                    )
+                return source
+        for edge in self.graph.execution_edges:
+            if edge.target.normalized == self.cell.cell_ref and edge.raw_reference == raw_reference:
+                source = _expression_reference_from_edge(edge)
+                if source.kind == "structured":
+                    raise FormulaTranslationError(
+                        "unsupported_structured_reference",
+                        "structured references are not supported",
+                        raw_reference,
+                    )
+                return source
+        if semantic_reference.kind == "structured":
+            raise FormulaTranslationError(
+                "unsupported_structured_reference",
+                "structured references are not supported",
+                raw_reference,
+            )
+        if semantic_reference.kind == "named_range":
+            raise FormulaTranslationError("unresolved_named_range", "named range could not be resolved", raw_reference)
+        return semantic_reference
+    def _peek(self) -> _FormulaToken | None:
+        if self.index >= len(self.tokens):
+            return None
+        return self.tokens[self.index]
+    def _advance(self) -> _FormulaToken:
+        token = self._peek()
+        if token is None:
+            raise FormulaTranslationError("unexpected_formula_end", "formula ended unexpectedly")
+        self.index += 1
+        return token
+    def _expect(self, value: str) -> None:
+        token = self._advance()
+        if token.value != value:
+            raise FormulaTranslationError("unexpected_formula_token", f"expected {value}", token.value)
+def _formula_tokens(raw_formula: str) -> tuple[_FormulaToken, ...]:
+    tokens: list[_FormulaToken] = []
+    for token in Tokenizer(raw_formula).items:
+        if token.type == "WHITE-SPACE":
+            continue
+        if token.type == "FUNC" and token.subtype == "OPEN":
+            tokens.append(_FormulaToken("identifier", token.value[:-1]))
+            tokens.append(_FormulaToken("operator", "("))
+            continue
+        if token.type == "FUNC" and token.subtype == "CLOSE":
+            tokens.append(_FormulaToken("operator", ")"))
+            continue
+        if token.type == "PAREN":
+            tokens.append(_FormulaToken("operator", token.value))
+            continue
+        if token.type == "SEP":
+            tokens.append(_FormulaToken("operator", token.value))
+            continue
+        if token.type == "OPERAND" and token.subtype == "NUMBER":
+            tokens.append(_FormulaToken("number", token.value))
+            continue
+        if token.type == "OPERAND" and token.subtype == "TEXT":
+            tokens.append(_FormulaToken("text", token.value.strip('"')))
+            continue
+        if token.type == "OPERAND" and token.subtype == "LOGICAL":
+            tokens.append(_FormulaToken("logical", token.value.upper()))
+            continue
+        if token.type == "OPERAND" and token.subtype == "ERROR":
+            raise FormulaTranslationError(
+                "unsupported_error_reference",
+                "formula contains an unsupported error reference",
+                token.value,
+            )
+        if token.type == "OPERAND" and token.subtype == "RANGE":
+            tokens.append(_FormulaToken("reference", token.value))
+            continue
+        if token.type.startswith("OPERATOR"):
+            if token.value not in SUPPORTED_OPERATORS:
+                raise FormulaTranslationError("unsupported_operator", "formula operator is not supported", token.value)
+            tokens.append(_FormulaToken("operator", token.value))
+            continue
+        raise FormulaTranslationError("unsupported_formula_token", "unsupported formula token", token.value)
+    return tuple(tokens)
+def _normalized_function_name(function_name: str) -> str:
+    if function_name.startswith("_XLFN."):
+        return function_name.removeprefix("_XLFN.")
+    return function_name
+def _number_value(raw_value: str) -> int | float:
+    value = float(raw_value)
+    if value.is_integer():
+        return int(value)
+    return value
+def _static_offset_reference(arguments: list[FormulaExpressionNode]) -> FormulaExpressionNode:
+    if len(arguments) != 3:
+        raise FormulaTranslationError(
+            "unsupported_offset_shape",
+            "only static three-argument OFFSET references are supported",
+            "OFFSET",
+        )
+    base, rows, columns = arguments
+    if base.kind != "reference" or base.reference is None or base.reference.kind != "cell":
+        raise FormulaTranslationError(
+            "unsupported_offset_reference",
+            "OFFSET base reference must resolve to one concrete cell",
+            "OFFSET",
+        )
+    row_offset = _literal_integer(rows)
+    column_offset = _literal_integer(columns)
+    if row_offset is None or column_offset is None:
+        raise FormulaTranslationError(
+            "unsupported_offset_argument",
+            "OFFSET row and column offsets must be static integers",
+            "OFFSET",
+        )
+    reference = _shift_cell_reference(base.reference, row_offset=row_offset, column_offset=column_offset)
+    return FormulaExpressionNode.reference_to(reference)
+def _literal_integer(node: FormulaExpressionNode) -> int | None:
+    if node.kind == "literal" and isinstance(node.value, int):
+        return node.value
+    if node.kind == "unary" and node.operator == "-":
+        (operand,) = node.operands
+        value = _literal_integer(operand)
+        return None if value is None else -value
+    return None
+def _shift_cell_reference(
+    reference: WorkbookReference,
+    *,
+    row_offset: int,
+    column_offset: int,
+) -> WorkbookReference:
+    if reference.sheet is None or reference.start_cell is None:
+        raise FormulaTranslationError(
+            "unsupported_offset_reference",
+            "OFFSET base reference must include a sheet and cell coordinate",
+            "OFFSET",
+        )
+    try:
+        min_col, min_row, max_col, max_row = range_boundaries(reference.start_cell)
+    except ValueError as error:
+        raise FormulaTranslationError(
+            "unsupported_offset_reference",
+            "OFFSET base reference must include a valid cell coordinate",
+            "OFFSET",
+        ) from error
+    if min_col != max_col or min_row != max_row:
+        raise FormulaTranslationError(
+            "unsupported_offset_reference",
+            "OFFSET base reference must resolve to one concrete cell",
+            "OFFSET",
+        )
+    shifted_column = min_col + column_offset
+    shifted_row = min_row + row_offset
+    if shifted_column < 1 or shifted_row < 1:
+        raise FormulaTranslationError(
+            "unsupported_offset_reference",
+            "OFFSET resolved outside the worksheet grid",
+            "OFFSET",
+        )
+    return normalize_reference(f"{reference.sheet}!{get_column_letter(shifted_column)}{shifted_row}")
+def _sheet_name(cell_ref: str) -> str:
+    return cell_ref.split("!", 1)[0]