modelwright 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,571 @@
1
+ """Formula expression records.
2
+
3
+ These records describe translated formula structure; they do not translate
4
+ Excel formula text themselves.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Literal
11
+
12
+ from openpyxl.formula.tokenizer import Tokenizer
13
+ from openpyxl.utils.cell import get_column_letter, range_boundaries
14
+
15
+ from modelwright.extraction import CellRecord
16
+ from modelwright.graph import DependencyEdge, DependencyGraph
17
+ from modelwright.references import WorkbookReference
18
+ from modelwright.references import normalize_reference
19
+
20
+
21
# JSON-compatible value shapes carried by diagnostics and literal nodes.
JsonValue = str | int | float | bool | None | list[Any] | dict[str, Any]

# Discriminator stored in ``FormulaExpressionNode.kind``.
ExpressionKind = Literal[
    "literal",
    "reference",
    "unary",
    "binary",
    "comparison",
    "function_call",
]

# Severity levels a translation diagnostic may carry.
DiagnosticSeverity = Literal["info", "warning", "error"]

# Lookup keyed by (normalized target cell, raw reference text as written in
# the formula) that yields the reference to place in the expression tree.
FormulaReferenceIndex = dict[tuple[str, str], WorkbookReference]
25
# Excel functions the parser accepts. Names are matched after upper-casing
# and after removing a leading "_XLFN." prefix.
SUPPORTED_FUNCTIONS = frozenset(
    (
        # logical
        "AND",
        "IF",
        "IFERROR",
        "IFNA",
        "OR",
        # aggregation
        "AVERAGE",
        "COUNTIF",
        "COUNTIFS",
        "MAX",
        "MIN",
        "SUM",
        "SUMIF",
        "SUMIFS",
        # text, rounding and lookup
        "CONCATENATE",
        "OFFSET",
        "ROUND",
        "VLOOKUP",
    )
)

# Operator token values the tokenizer lets through: arithmetic,
# concatenation, comparison, and the grouping/separator punctuation.
SUPPORTED_OPERATORS = frozenset(
    {"+", "-", "*", "/", "^"}
    | {"&"}
    | {">", ">=", "<", "<=", "=", "<>"}
    | {"(", ")", ","}
)
47
+
48
+
49
@dataclass(frozen=True)
class FormulaTranslationDiagnostic:
    """A single concern raised while translating one source formula.

    Instances are immutable and convert to and from plain dictionaries via
    ``to_dict`` / ``from_dict``.
    """

    # Machine-readable identifier of the concern.
    code: str
    # Human-readable explanation.
    message: str
    # One of "info", "warning" or "error"; defaults to "warning".
    severity: DiagnosticSeverity = "warning"
    # Where the concern was found (the translator passes the cell reference).
    location: str | None = None
    # Offending value or token text, when one is available.
    raw_value: JsonValue = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FormulaTranslationDiagnostic":
        """Build a diagnostic from its dictionary form.

        ``code`` and ``message`` are required keys (``KeyError`` otherwise);
        every other key falls back to the field default when absent.
        """
        code = data["code"]
        message = data["message"]
        severity = data.get("severity", "warning")
        location = data.get("location")
        raw_value = data.get("raw_value")
        return cls(code, message, severity, location, raw_value)

    def to_dict(self) -> dict[str, JsonValue]:
        """Return the diagnostic as a dictionary with one key per field."""
        serialized: dict[str, JsonValue] = {}
        for name in ("code", "message", "severity", "location", "raw_value"):
            serialized[name] = getattr(self, name)
        return serialized
77
+
78
+
79
@dataclass(frozen=True)
class FormulaExpressionNode:
    """Immutable node of a translated formula expression tree.

    ``kind`` says which of the optional fields are meaningful; the
    constructors below fill in the fields that belong to each kind and
    leave the rest at their defaults.
    """

    # Node discriminator (see ``ExpressionKind``).
    kind: ExpressionKind
    # Payload of a literal node.
    value: JsonValue = None
    # Target of a reference node.
    reference: WorkbookReference | None = None
    # Operator text of a unary, binary or comparison node.
    operator: str | None = None
    # Upper-cased function name of a function_call node.
    function_name: str | None = None
    # Child nodes in evaluation order; empty for leaves.
    operands: tuple["FormulaExpressionNode", ...] = field(default_factory=tuple)

    @classmethod
    def literal(cls, value: JsonValue) -> "FormulaExpressionNode":
        """Create a leaf holding a constant value."""
        return cls("literal", value=value)

    @classmethod
    def reference_to(cls, reference: WorkbookReference) -> "FormulaExpressionNode":
        """Create a leaf pointing at a workbook reference."""
        return cls("reference", reference=reference)

    @classmethod
    def unary(
        cls,
        operator: str,
        operand: "FormulaExpressionNode",
    ) -> "FormulaExpressionNode":
        """Create a one-operand operator node."""
        return cls("unary", operator=operator, operands=(operand,))

    @classmethod
    def binary(
        cls,
        operator: str,
        left: "FormulaExpressionNode",
        right: "FormulaExpressionNode",
    ) -> "FormulaExpressionNode":
        """Create a two-operand operator node."""
        children = (left, right)
        return cls("binary", operator=operator, operands=children)

    @classmethod
    def comparison(
        cls,
        operator: str,
        left: "FormulaExpressionNode",
        right: "FormulaExpressionNode",
    ) -> "FormulaExpressionNode":
        """Create a two-operand comparison node."""
        children = (left, right)
        return cls("comparison", operator=operator, operands=children)

    @classmethod
    def function_call(
        cls,
        function_name: str,
        operands: tuple["FormulaExpressionNode", ...],
    ) -> "FormulaExpressionNode":
        """Create a function call node; the name is stored upper-cased."""
        canonical_name = function_name.upper()
        return cls("function_call", function_name=canonical_name, operands=operands)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FormulaExpressionNode":
        """Rebuild a node, and recursively its operands, from a dictionary.

        Only ``kind`` is a required key. A present ``reference`` entry is
        handed to ``WorkbookReference.from_dict``.
        """
        kind = data["kind"]
        value = data.get("value")
        raw_reference = data.get("reference")
        reference = None if raw_reference is None else WorkbookReference.from_dict(raw_reference)
        operator = data.get("operator")
        function_name = data.get("function_name")
        children = tuple(cls.from_dict(child) for child in data.get("operands", []))
        return cls(
            kind=kind,
            value=value,
            reference=reference,
            operator=operator,
            function_name=function_name,
            operands=children,
        )

    def to_dict(self) -> dict[str, JsonValue]:
        """Return the node, and recursively its operands, as a dictionary."""
        reference = None if self.reference is None else self.reference.to_dict()
        children = [child.to_dict() for child in self.operands]
        return {
            "kind": self.kind,
            "value": self.value,
            "reference": reference,
            "operator": self.operator,
            "function_name": self.function_name,
            "operands": children,
        }
153
+
154
+
155
@dataclass(frozen=True)
class FormulaExpression:
    """Translation result for a single formula cell.

    Holds the source provenance, the expression tree when one was produced,
    and any diagnostics collected along the way.
    """

    # Reference of the cell the formula came from.
    source_cell: str
    # Formula text as recorded for the cell.
    raw_formula: str
    # Root of the expression tree; None when no tree was produced.
    root: FormulaExpressionNode | None = None
    # Concerns recorded during translation.
    diagnostics: tuple[FormulaTranslationDiagnostic, ...] = field(default_factory=tuple)

    @property
    def translated(self) -> bool:
        """True when a tree exists and no diagnostic has severity "error"."""
        if self.root is None:
            return False
        return all(item.severity != "error" for item in self.diagnostics)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FormulaExpression":
        """Rebuild an expression from its dictionary form.

        ``source_cell`` and ``raw_formula`` are required keys. A stored
        ``translated`` entry is ignored because the value is derived.
        """
        source_cell = data["source_cell"]
        raw_formula = data["raw_formula"]
        root_data = data.get("root")
        root = None if root_data is None else FormulaExpressionNode.from_dict(root_data)
        diagnostics = tuple(
            FormulaTranslationDiagnostic.from_dict(entry) for entry in data.get("diagnostics", [])
        )
        return cls(source_cell, raw_formula, root, diagnostics)

    def to_dict(self) -> dict[str, JsonValue]:
        """Return the expression as a dictionary, including ``translated``."""
        root_payload = None if self.root is None else self.root.to_dict()
        return {
            "source_cell": self.source_cell,
            "raw_formula": self.raw_formula,
            "root": root_payload,
            "translated": self.translated,
            "diagnostics": [item.to_dict() for item in self.diagnostics],
        }
188
+
189
+
190
def translate_formula_cell(
    cell: CellRecord,
    graph: DependencyGraph,
    reference_index: FormulaReferenceIndex | None = None,
) -> FormulaExpression:
    """Translate one formula cell into a ``FormulaExpression``.

    Args:
        cell: Extracted cell record; its ``formula`` carries the raw text.
        graph: Dependency graph used to resolve references in the formula.
        reference_index: Optional pre-built lookup (see
            ``build_formula_reference_index``) consulted before the graph.

    Returns:
        An expression with ``root`` set on success. When the cell has no
        formula record, or tokenizing/parsing raises
        ``FormulaTranslationError``, the expression has no root and carries
        a single error diagnostic located at the cell.
    """
    cell_ref = cell.cell_ref

    def failed(raw_formula: str, code: str, message: str, raw_value: JsonValue) -> FormulaExpression:
        # Every failure path yields the same shape: no tree, one error.
        diagnostic = FormulaTranslationDiagnostic(
            code=code,
            message=message,
            severity="error",
            location=cell_ref,
            raw_value=raw_value,
        )
        return FormulaExpression(
            source_cell=cell_ref,
            raw_formula=raw_formula,
            diagnostics=(diagnostic,),
        )

    formula = cell.formula
    if formula is None:
        return failed(
            str(cell.raw_value),
            "not_a_formula_cell",
            "cell does not contain a formula record",
            cell.raw_value,
        )

    raw_formula = formula.raw_formula
    try:
        root = _FormulaParser(
            tokens=_formula_tokens(raw_formula),
            cell=cell,
            graph=graph,
            reference_index=reference_index,
        ).parse()
    except FormulaTranslationError as error:
        return failed(raw_formula, error.code, error.message, error.raw_value)

    return FormulaExpression(source_cell=cell_ref, raw_formula=raw_formula, root=root)
238
+
239
+
240
def build_formula_reference_index(graph: DependencyGraph) -> FormulaReferenceIndex:
    """Index the graph's execution edges for constant-time reference lookup.

    Keys are ``(edge.target.normalized, edge.raw_reference)``. When several
    edges share a key, the first one encountered is kept.
    """
    lookup: FormulaReferenceIndex = {}
    for edge in graph.execution_edges:
        key = (edge.target.normalized, edge.raw_reference)
        if key not in lookup:
            lookup[key] = _expression_reference_from_edge(edge)
    return lookup
247
+
248
+
249
def _expression_reference_from_edge(edge: DependencyEdge) -> WorkbookReference:
    """Pick the reference an expression node should carry for ``edge``.

    The edge's ``resolved_from`` reference is used when it is a cell or a
    range; in every other case the edge's ``source`` is returned.
    """
    origin = edge.resolved_from
    if origin is None or origin.kind not in {"cell", "range"}:
        return edge.source
    return origin
253
+
254
+
255
@dataclass(frozen=True)
class _FormulaToken:
    """Simplified, immutable token consumed by the formula parser."""

    # Token category, e.g. "number", "text", "logical", "reference",
    # "identifier" or "operator".
    kind: str
    # Token text as the parser sees it.
    value: str
259
+
260
+
261
class FormulaTranslationError(Exception):
    """Raised when a formula cannot be translated.

    Attributes:
        code: Machine-readable identifier of the failure.
        message: Human-readable explanation; also the exception text.
        raw_value: Offending token or value, when one is available.
    """

    def __init__(self, code: str, message: str, raw_value: JsonValue = None) -> None:
        self.code, self.message, self.raw_value = code, message, raw_value
        super().__init__(message)
267
+
268
+
269
class _FormulaParser:
    """Recursive-descent parser over the simplified formula token stream.

    Operator precedence, loosest to tightest: comparison, ``&``, ``+``/``-``,
    ``*``/``/``, ``^``, unary sign. Every binary level, comparisons included,
    associates to the left. Reference operands are resolved against the
    optional reference index and the dependency graph while parsing.

    Only tokens of kind "operator" are ever treated as operators or
    punctuation, so a text literal such as ``")"`` or ``"="`` is always an
    operand.

    Raises:
        FormulaTranslationError: for any token sequence, function or
            reference the parser does not support.
    """

    _COMPARISON_OPERATORS = frozenset({">", ">=", "<", "<=", "=", "<>"})
    _CONCATENATION_OPERATORS = frozenset({"&"})
    _ADDITIVE_OPERATORS = frozenset({"+", "-"})
    _MULTIPLICATIVE_OPERATORS = frozenset({"*", "/"})
    _EXPONENT_OPERATORS = frozenset({"^"})
    _OPEN_PAREN = frozenset({"("})
    _CLOSE_PAREN = frozenset({")"})
    _ARGUMENT_SEPARATOR = frozenset({","})

    def __init__(
        self,
        *,
        tokens: tuple[_FormulaToken, ...],
        cell: CellRecord,
        graph: DependencyGraph,
        reference_index: FormulaReferenceIndex | None,
    ) -> None:
        self.tokens = tokens
        self.cell = cell
        self.graph = graph
        self.reference_index = reference_index
        # Position of the next unconsumed token.
        self.index = 0

    def parse(self) -> FormulaExpressionNode:
        """Parse the whole token stream into a single expression tree."""
        expression = self._parse_comparison()
        leftover = self._peek()
        if leftover is not None:
            raise FormulaTranslationError(
                "unexpected_formula_token",
                "unexpected token after expression",
                leftover.value,
            )
        return expression

    def _parse_comparison(self) -> FormulaExpressionNode:
        # Comparisons chain left to right, so A1=B1=C1 parses as (A1=B1)=C1
        # instead of being rejected after the first comparison.
        expression = self._parse_concatenation()
        while (token := self._match_operator(self._COMPARISON_OPERATORS)) is not None:
            expression = FormulaExpressionNode.comparison(
                token.value, expression, self._parse_concatenation()
            )
        return expression

    def _parse_concatenation(self) -> FormulaExpressionNode:
        expression = self._parse_additive()
        while (token := self._match_operator(self._CONCATENATION_OPERATORS)) is not None:
            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_additive())
        return expression

    def _parse_additive(self) -> FormulaExpressionNode:
        expression = self._parse_multiplicative()
        while (token := self._match_operator(self._ADDITIVE_OPERATORS)) is not None:
            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_multiplicative())
        return expression

    def _parse_multiplicative(self) -> FormulaExpressionNode:
        expression = self._parse_exponent()
        while (token := self._match_operator(self._MULTIPLICATIVE_OPERATORS)) is not None:
            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_exponent())
        return expression

    def _parse_exponent(self) -> FormulaExpressionNode:
        # Operands come from _parse_unary, so a sign binds tighter than "^".
        expression = self._parse_unary()
        while (token := self._match_operator(self._EXPONENT_OPERATORS)) is not None:
            expression = FormulaExpressionNode.binary(token.value, expression, self._parse_unary())
        return expression

    def _parse_unary(self) -> FormulaExpressionNode:
        token = self._match_operator(self._ADDITIVE_OPERATORS)
        if token is None:
            return self._parse_primary()
        operand = self._parse_unary()
        if token.value == "+":
            # Unary plus leaves no trace in the tree.
            return operand
        return FormulaExpressionNode.unary(token.value, operand)

    def _parse_primary(self) -> FormulaExpressionNode:
        token = self._advance()
        if token.kind == "number":
            return FormulaExpressionNode.literal(_number_value(token.value))
        if token.kind == "text":
            return FormulaExpressionNode.literal(token.value)
        if token.kind == "logical":
            return FormulaExpressionNode.literal(token.value == "TRUE")
        if token.kind == "reference":
            return FormulaExpressionNode.reference_to(self._resolved_reference(token.value))
        if token.kind == "identifier":
            return self._parse_function_call(token.value)
        if token.kind == "operator" and token.value == "(":
            expression = self._parse_comparison()
            self._expect(")")
            return expression
        raise FormulaTranslationError("unsupported_formula_token", "unsupported formula token", token.value)

    def _parse_function_call(self, function_name: str) -> FormulaExpressionNode:
        """Parse ``( arg, ... )`` following an already consumed function name."""
        self._expect("(")
        raw_function_name = function_name.upper()
        function_name = _normalized_function_name(raw_function_name)
        if function_name not in SUPPORTED_FUNCTIONS:
            raise FormulaTranslationError(
                "unsupported_function",
                f"function {raw_function_name} is not supported",
                raw_function_name,
            )

        arguments: list[FormulaExpressionNode] = []
        # Only a real closing parenthesis means "no arguments"; a text
        # literal whose content is ")" is an ordinary first argument.
        if self._match_operator(self._CLOSE_PAREN) is None:
            while True:
                arguments.append(self._parse_comparison())
                if self._match_operator(self._ARGUMENT_SEPARATOR) is None:
                    break
            self._expect(")")

        if function_name == "OFFSET":
            # Every OFFSET call, including one without arguments, must
            # reduce to a static reference or be reported as unsupported.
            return _static_offset_reference(arguments)
        return FormulaExpressionNode.function_call(function_name, tuple(arguments))

    def _resolved_reference(self, raw_reference: str) -> WorkbookReference:
        """Resolve raw reference text to the reference stored in the tree.

        Ranges are returned as normalized. Otherwise the reference index and
        then the graph's execution edges are consulted for an edge targeting
        this cell with the same raw text; failing that, the normalized
        reference is used unless it is structured or a named range.
        """
        semantic_reference = normalize_reference(raw_reference, current_sheet=_sheet_name(self.cell.cell_ref))
        if semantic_reference.kind == "range":
            return semantic_reference

        source = self._graph_reference(raw_reference)
        if source is not None:
            self._reject_structured(source, raw_reference)
            return source

        self._reject_structured(semantic_reference, raw_reference)
        if semantic_reference.kind == "named_range":
            raise FormulaTranslationError("unresolved_named_range", "named range could not be resolved", raw_reference)
        return semantic_reference

    def _graph_reference(self, raw_reference: str) -> WorkbookReference | None:
        """Return the graph-derived reference for this cell, or None if absent."""
        target = self.cell.cell_ref
        if self.reference_index is not None:
            indexed = self.reference_index.get((target, raw_reference))
            if indexed is not None:
                return indexed
        # The linear scan also serves as the fallback on an index miss.
        for edge in self.graph.execution_edges:
            if edge.target.normalized == target and edge.raw_reference == raw_reference:
                return _expression_reference_from_edge(edge)
        return None

    @staticmethod
    def _reject_structured(reference: WorkbookReference, raw_reference: str) -> None:
        """Raise when ``reference`` is a structured (table) reference."""
        if reference.kind == "structured":
            raise FormulaTranslationError(
                "unsupported_structured_reference",
                "structured references are not supported",
                raw_reference,
            )

    def _peek(self) -> _FormulaToken | None:
        """Return the next token without consuming it, or None at the end."""
        if self.index >= len(self.tokens):
            return None
        return self.tokens[self.index]

    def _advance(self) -> _FormulaToken:
        """Consume and return the next token; raise at the end of input."""
        token = self._peek()
        if token is None:
            raise FormulaTranslationError("unexpected_formula_end", "formula ended unexpectedly")
        self.index += 1
        return token

    def _match_operator(self, values: frozenset[str]) -> _FormulaToken | None:
        """Consume the next token if it is an operator whose value is in ``values``.

        Returns the consumed token, or None (consuming nothing) otherwise.
        """
        token = self._peek()
        if token is None or token.kind != "operator" or token.value not in values:
            return None
        self.index += 1
        return token

    def _expect(self, value: str) -> None:
        """Consume the next token and require it to be the operator ``value``."""
        token = self._advance()
        if token.kind != "operator" or token.value != value:
            raise FormulaTranslationError("unexpected_formula_token", f"expected {value}", token.value)
436
+
437
+
438
def _formula_tokens(raw_formula: str) -> tuple[_FormulaToken, ...]:
    """Convert Excel formula text into the simplified tokens the parser uses.

    The openpyxl tokenizer output is mapped as follows: whitespace is
    dropped; a function opener becomes an "identifier" token followed by a
    "(" operator; closers, parentheses and separators become "operator"
    tokens; number, text, logical and range operands become "number",
    "text", "logical" and "reference" tokens.

    Text literals are unquoted here: the surrounding quotes are removed and
    each doubled quote inside the literal is collapsed to a single quote.

    Raises:
        FormulaTranslationError: when the text cannot be tokenized
            ("invalid_formula_syntax"), contains an error operand
            ("unsupported_error_reference"), an operator outside
            ``SUPPORTED_OPERATORS`` ("unsupported_operator"), or any other
            token type ("unsupported_formula_token").
    """
    # Imported here so the function stands on its own; the module already
    # depends on openpyxl's tokenizer.
    from openpyxl.formula.tokenizer import TokenizerError

    try:
        items = Tokenizer(raw_formula).items
    except TokenizerError as error:
        # Malformed formula text is reported like every other translation
        # failure instead of escaping as a tokenizer exception.
        raise FormulaTranslationError(
            "invalid_formula_syntax",
            "formula text could not be tokenized",
            raw_formula,
        ) from error

    tokens: list[_FormulaToken] = []
    for item in items:
        item_type, subtype, value = item.type, item.subtype, item.value
        if item_type == "WHITE-SPACE":
            continue
        if item_type == "FUNC" and subtype == "OPEN":
            # The opener's value ends with "("; drop it to get the bare name.
            tokens.append(_FormulaToken("identifier", value[:-1]))
            tokens.append(_FormulaToken("operator", "("))
        elif item_type == "FUNC" and subtype == "CLOSE":
            tokens.append(_FormulaToken("operator", ")"))
        elif item_type in ("PAREN", "SEP"):
            tokens.append(_FormulaToken("operator", value))
        elif item_type == "OPERAND" and subtype == "NUMBER":
            tokens.append(_FormulaToken("number", value))
        elif item_type == "OPERAND" and subtype == "TEXT":
            # Remove exactly one quote from each end, then unescape "" -> ".
            # strip('"') would also eat quotes that belong to the text.
            tokens.append(_FormulaToken("text", value[1:-1].replace('""', '"')))
        elif item_type == "OPERAND" and subtype == "LOGICAL":
            tokens.append(_FormulaToken("logical", value.upper()))
        elif item_type == "OPERAND" and subtype == "ERROR":
            raise FormulaTranslationError(
                "unsupported_error_reference",
                "formula contains an unsupported error reference",
                value,
            )
        elif item_type == "OPERAND" and subtype == "RANGE":
            tokens.append(_FormulaToken("reference", value))
        elif item_type.startswith("OPERATOR"):
            if value not in SUPPORTED_OPERATORS:
                raise FormulaTranslationError("unsupported_operator", "formula operator is not supported", value)
            tokens.append(_FormulaToken("operator", value))
        else:
            raise FormulaTranslationError("unsupported_formula_token", "unsupported formula token", value)
    return tuple(tokens)
481
+
482
+
483
def _normalized_function_name(function_name: str) -> str:
    """Return ``function_name`` without a leading ``_XLFN.`` prefix.

    The match is case-sensitive; names without the prefix come back
    unchanged.
    """
    return function_name.removeprefix("_XLFN.")
487
+
488
+
489
def _number_value(raw_value: str) -> int | float:
    """Parse numeric token text, preferring ``int`` for whole values.

    The text is read as a float first, so ``"2.0"`` and ``"1e3"`` come back
    as the integers 2 and 1000, while ``"2.5"`` stays a float.
    """
    parsed = float(raw_value)
    return int(parsed) if parsed.is_integer() else parsed
494
+
495
+
496
def _static_offset_reference(arguments: list[FormulaExpressionNode]) -> FormulaExpressionNode:
    """Fold a static ``OFFSET(cell, rows, columns)`` call into a reference node.

    Args:
        arguments: Parsed argument nodes of the OFFSET call.

    Returns:
        A reference node for the cell reached by shifting the base cell.

    Raises:
        FormulaTranslationError: unless there are exactly three arguments,
            the first is a reference node whose reference is a single cell,
            and both offsets are static integers.
    """

    def unsupported(code: str, message: str) -> FormulaTranslationError:
        # All OFFSET failures report "OFFSET" as the raw value.
        return FormulaTranslationError(code, message, "OFFSET")

    if len(arguments) != 3:
        raise unsupported(
            "unsupported_offset_shape",
            "only static three-argument OFFSET references are supported",
        )

    anchor, row_node, column_node = arguments
    anchor_reference = anchor.reference if anchor.kind == "reference" else None
    if anchor_reference is None or anchor_reference.kind != "cell":
        raise unsupported(
            "unsupported_offset_reference",
            "OFFSET base reference must resolve to one concrete cell",
        )

    row_offset = _literal_integer(row_node)
    column_offset = _literal_integer(column_node)
    if any(offset is None for offset in (row_offset, column_offset)):
        raise unsupported(
            "unsupported_offset_argument",
            "OFFSET row and column offsets must be static integers",
        )

    shifted = _shift_cell_reference(
        anchor_reference,
        row_offset=row_offset,
        column_offset=column_offset,
    )
    return FormulaExpressionNode.reference_to(shifted)
521
+
522
+
523
def _literal_integer(node: FormulaExpressionNode) -> int | None:
    """Return the static integer a node denotes, or None if it has none.

    Recognized shapes are an integer literal and any nesting of unary minus
    around one. Logical literals are accepted and returned as the plain
    integers 1 and 0, so callers never receive a ``bool``.
    """
    if node.kind == "literal" and isinstance(node.value, int):
        # bool is a subclass of int; int() normalizes True/False to 1/0.
        return int(node.value)
    if node.kind == "unary" and node.operator == "-" and len(node.operands) == 1:
        inner = _literal_integer(node.operands[0])
        return None if inner is None else -inner
    # Anything else, including a unary node without exactly one operand,
    # is not a static integer.
    return None
531
+
532
+
533
def _shift_cell_reference(
    reference: WorkbookReference,
    *,
    row_offset: int,
    column_offset: int,
) -> WorkbookReference:
    """Return the single-cell reference reached by shifting ``reference``.

    Args:
        reference: Base reference; needs a sheet and a single-cell
            ``start_cell`` coordinate.
        row_offset: Rows to move; negative values move up.
        column_offset: Columns to move; negative values move left.

    Returns:
        The normalized reference of the shifted cell on the same sheet.

    Raises:
        FormulaTranslationError: with code "unsupported_offset_reference"
            when the base lacks a sheet or coordinate, the coordinate is
            invalid or spans more than one cell, or the shifted cell falls
            outside the worksheet grid in any direction.
    """
    # Excel worksheet limits: last column is XFD (16384), last row 1048576.
    max_grid_column = 16384
    max_grid_row = 1048576

    if reference.sheet is None or reference.start_cell is None:
        raise FormulaTranslationError(
            "unsupported_offset_reference",
            "OFFSET base reference must include a sheet and cell coordinate",
            "OFFSET",
        )
    try:
        min_col, min_row, max_col, max_row = range_boundaries(reference.start_cell)
    except ValueError as error:
        raise FormulaTranslationError(
            "unsupported_offset_reference",
            "OFFSET base reference must include a valid cell coordinate",
            "OFFSET",
        ) from error
    if min_col != max_col or min_row != max_row:
        raise FormulaTranslationError(
            "unsupported_offset_reference",
            "OFFSET base reference must resolve to one concrete cell",
            "OFFSET",
        )

    shifted_column = min_col + column_offset
    shifted_row = min_row + row_offset
    # Check both edges of the grid: without the upper bounds a large offset
    # would produce a reference to a cell that cannot exist, or make
    # get_column_letter raise ValueError for very large column indexes.
    inside_grid = 1 <= shifted_column <= max_grid_column and 1 <= shifted_row <= max_grid_row
    if not inside_grid:
        raise FormulaTranslationError(
            "unsupported_offset_reference",
            "OFFSET resolved outside the worksheet grid",
            "OFFSET",
        )
    return normalize_reference(f"{reference.sheet}!{get_column_letter(shifted_column)}{shifted_row}")
568
+
569
+
570
def _sheet_name(cell_ref: str) -> str:
    """Return the sheet part of a ``Sheet!A1`` style cell reference.

    The split happens at the last ``!``: a cell coordinate never contains
    one, whereas a sheet name may. Text without any ``!`` is returned
    unchanged.
    """
    return cell_ref.rsplit("!", 1)[0]