modelwright 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
modelwright/graph.py ADDED
@@ -0,0 +1,591 @@
1
+ """Dependency graph records built from extracted workbook facts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Literal
7
+
8
+ from openpyxl.utils.cell import get_column_letter, range_boundaries
9
+
10
+ from modelwright.extraction import CellRecord, TableRecord, WorkbookRecord
11
+ from modelwright.references import WorkbookReference, normalize_reference
12
+
13
+
14
+ JsonValue = str | int | float | bool | None | list[Any] | dict[str, Any]
15
+ EdgeKind = Literal["semantic", "execution"]
16
+
17
+
18
@dataclass(frozen=True)
class DependencyEdge:
    """A single dependency link from an upstream source to a formula cell.

    ``resolved_from`` optionally records the reference the source was derived
    from, and ``diagnostic_code`` optionally carries a diagnostic for the edge.
    """

    source: WorkbookReference
    target: WorkbookReference
    edge_kind: EdgeKind
    raw_reference: str
    resolved_from: WorkbookReference | None = None
    diagnostic_code: str | None = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "DependencyEdge":
        """Rebuild an edge from the mapping layout produced by ``to_dict``."""
        origin_payload = data.get("resolved_from")
        source = WorkbookReference.from_dict(data["source"])
        target = WorkbookReference.from_dict(data["target"])
        edge_kind = data["edge_kind"]
        raw_reference = data["raw_reference"]
        # "resolved_from" is optional; a missing key and an explicit None are
        # both treated as "no origin".
        origin = None
        if origin_payload is not None:
            origin = WorkbookReference.from_dict(origin_payload)
        return cls(
            source=source,
            target=target,
            edge_kind=edge_kind,
            raw_reference=raw_reference,
            resolved_from=origin,
            diagnostic_code=data.get("diagnostic_code"),
        )

    def to_dict(self) -> dict[str, JsonValue]:
        """Serialize the edge into a plain mapping of JSON-compatible values."""
        payload: dict[str, JsonValue] = {}
        payload["source"] = self.source.to_dict()
        payload["target"] = self.target.to_dict()
        payload["edge_kind"] = self.edge_kind
        payload["raw_reference"] = self.raw_reference
        if self.resolved_from is None:
            payload["resolved_from"] = None
        else:
            payload["resolved_from"] = self.resolved_from.to_dict()
        payload["diagnostic_code"] = self.diagnostic_code
        return payload
50
+
51
+
52
@dataclass(frozen=True)
class DependencyGraph:
    """All dependency edges and diagnostic codes for one workbook extraction."""

    workbook_id: str
    edges: tuple[DependencyEdge, ...] = field(default_factory=tuple)
    diagnostics: tuple[str, ...] = field(default_factory=tuple)

    @property
    def semantic_edges(self) -> tuple[DependencyEdge, ...]:
        """Edges whose ``edge_kind`` is ``"semantic"``, in stored order."""
        return tuple(filter(lambda edge: edge.edge_kind == "semantic", self.edges))

    @property
    def execution_edges(self) -> tuple[DependencyEdge, ...]:
        """Edges whose ``edge_kind`` is ``"execution"``, in stored order."""
        return tuple(filter(lambda edge: edge.edge_kind == "execution", self.edges))

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "DependencyGraph":
        """Rebuild a graph from a mapping; "edges" and "diagnostics" are optional."""
        workbook_id = data["workbook_id"]
        restored_edges = [DependencyEdge.from_dict(item) for item in data.get("edges", [])]
        restored_diagnostics = data.get("diagnostics", [])
        return cls(
            workbook_id=workbook_id,
            edges=tuple(restored_edges),
            diagnostics=tuple(restored_diagnostics),
        )

    def to_dict(self) -> dict[str, JsonValue]:
        """Serialize the graph into a plain mapping of JSON-compatible values."""
        serialized_edges = []
        for edge in self.edges:
            serialized_edges.append(edge.to_dict())
        return {
            "workbook_id": self.workbook_id,
            "edges": serialized_edges,
            "diagnostics": [*self.diagnostics],
        }
82
+
83
+
84
def build_dependency_graph(workbook: WorkbookRecord) -> DependencyGraph:
    """Derive semantic and execution dependency edges from a workbook's formulas.

    Every raw reference of every formula cell yields one semantic edge followed
    by the execution edges it resolves to. The returned graph also carries the
    de-duplicated diagnostic codes found on the edges, plus any circular
    dependency code, in first-seen order.
    """

    named_ranges = _named_range_destinations(workbook)
    tables: dict[str, TableRecord] = {}
    for table in workbook.tables:
        tables[table.name] = table

    edges: list[DependencyEdge] = []
    formula_cells = (cell for cell in workbook.cells if cell.formula is not None)
    for cell in formula_cells:
        target = _target_reference(cell)
        for raw_reference in cell.formula.raw_references:
            # Unqualified references are normalized relative to the formula cell's sheet.
            source = normalize_reference(raw_reference, current_sheet=target.sheet)
            execution_edges = _execution_edges_for(
                source,
                target,
                raw_reference,
                named_ranges,
                tables,
                raw_formula=cell.formula.raw_formula,
            )

            # A structured reference whose execution edges all came back clean
            # does not carry its own diagnostic onto the semantic edge.
            resolved_cleanly = source.kind == "structured" and all(
                edge.diagnostic_code is None for edge in execution_edges
            )
            semantic_code = None if resolved_cleanly else source.diagnostic_code

            semantic_edge = DependencyEdge(
                source=source,
                target=target,
                edge_kind="semantic",
                raw_reference=raw_reference,
                diagnostic_code=semantic_code,
            )
            edges.append(semantic_edge)
            edges.extend(execution_edges)

    collected_codes = [*_diagnostic_codes(edges), *_circular_dependency_codes(edges)]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_codes = tuple(dict.fromkeys(collected_codes))
    return DependencyGraph(workbook_id=workbook.workbook_id, edges=tuple(edges), diagnostics=unique_codes)
125
+
126
+
127
def _named_range_destinations(workbook: WorkbookRecord) -> dict[str, tuple[WorkbookReference, ...]]:
    """Map each named range to its normalized cell and range destinations.

    Destinations that normalize to any other kind are left out, so a name can
    map to an empty tuple.
    """
    destinations_by_name: dict[str, tuple[WorkbookReference, ...]] = {}
    for named_range in workbook.named_ranges:
        name = named_range.name
        usable: list[WorkbookReference] = []
        for destination in named_range.destinations:
            reference = normalize_reference(destination)
            if reference.kind in {"cell", "range"}:
                usable.append(reference)
        destinations_by_name[name] = tuple(usable)
    return destinations_by_name
136
+
137
+
138
def _target_reference(cell: CellRecord) -> WorkbookReference:
    """Normalize a formula cell's own address.

    Raises:
        ValueError: if the normalized address is not of kind ``"cell"``.
    """
    reference = normalize_reference(cell.cell_ref)
    if reference.kind == "cell":
        return reference
    raise ValueError(f"formula cell target is not a cell reference: {cell.cell_ref}")
143
+
144
+
145
def _execution_edges_for(
    source: WorkbookReference,
    target: WorkbookReference,
    raw_reference: str,
    named_ranges: dict[str, tuple[WorkbookReference, ...]],
    tables: dict[str, TableRecord],
    *,
    raw_formula: str,
) -> tuple[DependencyEdge, ...]:
    """Resolve one formula reference into its execution edges.

    Resolution is tried in this order: a statically resolvable OFFSET call, a
    plain cell, a range (one edge per cell), a known named range, and a
    structured table reference. Anything left over produces a single edge that
    carries a diagnostic code.
    """

    def execution_edge(
        origin: WorkbookReference,
        *,
        resolved_from: WorkbookReference | None = None,
        diagnostic_code: str | None = None,
    ) -> DependencyEdge:
        # All edges built here share the target, edge kind and raw reference.
        return DependencyEdge(
            source=origin,
            target=target,
            edge_kind="execution",
            raw_reference=raw_reference,
            resolved_from=resolved_from,
            diagnostic_code=diagnostic_code,
        )

    static_offset = _static_offset_reference(
        raw_formula=raw_formula,
        raw_reference=raw_reference,
        source=source,
        target=target,
        tables=tables,
    )
    if static_offset:
        return (execution_edge(static_offset.shifted, resolved_from=static_offset.base),)

    if source.kind == "cell":
        return (execution_edge(source),)

    if source.kind == "range":
        return tuple(
            execution_edge(range_cell, resolved_from=source)
            for range_cell in _expand_range_reference(source)
        )

    if source.kind == "named_range" and source.name in named_ranges:
        collected: list[DependencyEdge] = []
        for destination in named_ranges[source.name]:
            if destination.kind == "range":
                # Range destinations fan out per cell and point back at the range.
                collected.extend(
                    execution_edge(range_cell, resolved_from=destination)
                    for range_cell in _expand_range_reference(destination)
                )
            else:
                collected.append(execution_edge(destination, resolved_from=source))
        return tuple(collected)

    if source.kind == "structured":
        resolved = _resolve_structured_reference(source, target, tables)
        if resolved is not None:
            return (execution_edge(resolved, resolved_from=source),)

    # Unresolved structured references and every other kind end up here.
    fallback_code = source.diagnostic_code or f"unsupported_{source.kind}_dependency"
    return (execution_edge(source, diagnostic_code=fallback_code),)
241
+
242
+
243
@dataclass(frozen=True)
class _StaticOffsetReference:
    """An OFFSET base reference paired with the reference reached after shifting."""

    # Reference the offsets are applied to.
    base: WorkbookReference
    # Reference obtained by applying the row and column offsets to ``base``.
    shifted: WorkbookReference
247
+
248
+
249
def _static_offset_reference(
    *,
    raw_formula: str,
    raw_reference: str,
    source: WorkbookReference,
    target: WorkbookReference,
    tables: dict[str, TableRecord],
) -> _StaticOffsetReference | None:
    """Resolve a three-argument OFFSET call with literal integer offsets.

    Only OFFSET calls whose first argument is exactly ``raw_reference`` are
    considered. Returns the first call that resolves to a shifted cell, or
    ``None`` when no call qualifies.
    """
    three_argument_calls = (
        call_arguments
        for call_arguments in _offset_argument_lists(raw_formula)
        if len(call_arguments) == 3
    )
    for anchor_text, rows_text, columns_text in three_argument_calls:
        if anchor_text != raw_reference:
            continue

        rows = _static_integer_argument(rows_text)
        columns = _static_integer_argument(columns_text)
        if rows is None or columns is None:
            # At least one offset is not a literal integer.
            continue

        anchor = _static_offset_base(source, target, tables)
        if anchor is None:
            continue

        moved = _shift_cell_reference(anchor, row_offset=rows, column_offset=columns)
        if moved is None:
            continue
        return _StaticOffsetReference(base=anchor, shifted=moved)
    return None
274
+
275
+
276
def _offset_argument_lists(raw_formula: str) -> tuple[tuple[str, ...], ...]:
    """Return the split argument list of each OFFSET call in a formula.

    A single leading ``=`` is ignored. OFFSET calls nested inside the arguments
    of an already collected OFFSET call are not reported separately, because
    the search resumes after that call's closing parenthesis. Calls whose
    closing parenthesis cannot be found are skipped.
    """
    import re  # local import: the module header does not import ``re``

    formula = raw_formula.removeprefix("=")
    # Match case-insensitively against the original text. Searching an
    # upper-cased copy is unsafe because upper-casing can change the string
    # length (e.g. "ß" -> "SS"), which shifts every later index. The lookbehind
    # rejects names that merely end in OFFSET, such as MYOFFSET( or X.OFFSET(.
    offset_call = re.compile(r"(?<![\w.])OFFSET\(", re.IGNORECASE)

    argument_lists: list[tuple[str, ...]] = []
    search_from = 0
    while (match := offset_call.search(formula, search_from)) is not None:
        args_start = match.end()
        # args_start - 1 is the index of the opening parenthesis.
        args_end = _matching_parenthesis(formula, args_start - 1)
        if args_end is None:
            search_from = args_start
            continue
        argument_lists.append(_split_formula_arguments(formula[args_start:args_end]))
        search_from = args_end + 1
    return tuple(argument_lists)
291
+
292
+
293
def _matching_parenthesis(formula: str, open_index: int) -> int | None:
    """Find the parenthesis that closes the one at ``open_index``.

    Parentheses are ignored inside double-quoted string literals, inside
    single-quoted sheet names (``'Sheet (1)'!A1``) and inside square brackets.

    Returns:
        The index of the matching ``)``, or ``None`` if the formula ends first.
    """
    depth = 0
    bracket_depth = 0
    in_string = False
    in_sheet_quote = False
    for index in range(open_index, len(formula)):
        character = formula[index]
        if character == '"' and not in_sheet_quote:
            # An escaped quote ("") toggles twice, leaving the state unchanged.
            in_string = not in_string
        elif in_string:
            continue
        elif character == "'" and bracket_depth == 0:
            # Apostrophes inside brackets are left alone; only top-level ones
            # delimit a quoted sheet name.
            in_sheet_quote = not in_sheet_quote
        elif in_sheet_quote:
            continue
        elif character == "[":
            bracket_depth += 1
        elif character == "]":
            if bracket_depth:
                bracket_depth -= 1
        elif bracket_depth == 0:
            if character == "(":
                depth += 1
            elif character == ")":
                depth -= 1
                if depth == 0:
                    return index
    return None
315
+
316
+
317
def _split_formula_arguments(arguments: str) -> tuple[str, ...]:
    """Split the text between a call's parentheses on its top-level commas.

    Commas are not treated as separators inside double-quoted string literals,
    single-quoted sheet names (``'Sales, 2024'!A1``), square brackets, or
    nested parentheses. Each argument is stripped of surrounding whitespace.
    An empty input yields a single empty argument.
    """
    parts: list[str] = []
    current: list[str] = []
    paren_depth = 0
    bracket_depth = 0
    in_string = False
    in_sheet_quote = False
    for character in arguments:
        if character == '"' and not in_sheet_quote:
            # An escaped quote ("") toggles twice, leaving the state unchanged.
            in_string = not in_string
        elif in_string:
            pass
        elif character == "'" and bracket_depth == 0:
            # Apostrophes inside brackets are left alone; only top-level ones
            # delimit a quoted sheet name.
            in_sheet_quote = not in_sheet_quote
        elif in_sheet_quote:
            pass
        elif character == "[":
            bracket_depth += 1
        elif character == "]":
            if bracket_depth:
                bracket_depth -= 1
        elif bracket_depth == 0:
            if character == "(":
                paren_depth += 1
            elif character == ")":
                if paren_depth:
                    paren_depth -= 1
            elif character == "," and paren_depth == 0:
                # Top-level separator: close the current argument and drop the comma.
                parts.append("".join(current).strip())
                current = []
                continue
        current.append(character)
    parts.append("".join(current).strip())
    return tuple(parts)
344
+
345
+
346
def _static_integer_argument(argument: str) -> int | None:
    """Parse an argument as a literal integer, or return ``None`` if ``int()`` rejects it."""
    try:
        value = int(argument)
    except ValueError:
        return None
    return value
351
+
352
+
353
def _static_offset_base(
    source: WorkbookReference,
    target: WorkbookReference,
    tables: dict[str, TableRecord],
) -> WorkbookReference | None:
    """Return the single cell an OFFSET call starts from, if there is one.

    A cell reference is returned unchanged. A structured reference is accepted
    only when it resolves to a single cell. Every other kind yields ``None``.
    """
    kind = source.kind
    if kind == "cell":
        return source
    if kind != "structured":
        return None
    candidate = _resolve_structured_reference(source, target, tables)
    if candidate is None or candidate.kind != "cell":
        return None
    return candidate
364
+
365
+
366
def _shift_cell_reference(
    reference: WorkbookReference,
    *,
    row_offset: int,
    column_offset: int,
) -> WorkbookReference | None:
    """Shift a single-cell reference by a row and column offset.

    Returns:
        The normalized shifted reference, or ``None`` when the reference has no
        sheet or start cell, is not a single cell, or the shifted position
        cannot be expressed as a cell address.
    """
    if reference.sheet is None or reference.start_cell is None:
        return None
    try:
        min_col, min_row, max_col, max_row = range_boundaries(reference.start_cell)
    except ValueError:
        return None
    if min_col is None or min_row is None:
        # Defensive: a row-only or column-only address has no single cell to shift.
        return None
    if min_col != max_col or min_row != max_row:
        return None
    shifted_column = min_col + column_offset
    shifted_row = min_row + row_offset
    if shifted_column < 1 or shifted_row < 1:
        return None
    try:
        # get_column_letter rejects indexes outside its supported range with
        # ValueError; treat that as "cannot shift" instead of aborting the build.
        column_letter = get_column_letter(shifted_column)
    except ValueError:
        return None
    return normalize_reference(f"{reference.sheet}!{column_letter}{shifted_row}")
385
+
386
+
387
def _resolve_structured_reference(
    source: WorkbookReference,
    target: WorkbookReference,
    tables: dict[str, TableRecord],
) -> WorkbookReference | None:
    """Translate a structured table reference into a cell or range reference.

    References without a table name are resolved against the table containing
    ``target``. Returns ``None`` when the reference cannot be parsed, the table
    or column is unknown, or a current-row reference has no matching row.
    """
    parts = _parse_structured_reference(source.original)
    if parts is None:
        return None

    if parts.table_name is None:
        table = _table_containing_target(target, tables)
    else:
        table = tables.get(parts.table_name)
    if table is None:
        return None

    try:
        first_col, header_row, last_col, last_row = range_boundaries(table.ref)
    except ValueError:
        return None

    if parts.column is None:
        # No column selector: the whole table, with or without its first row.
        top_row = header_row if parts.include_headers else header_row + 1
        return normalize_reference(
            f"{table.sheet}!{_column_name(first_col)}{top_row}:{_column_name(last_col)}{last_row}"
        )

    try:
        column_position = table.columns.index(parts.column)
    except ValueError:
        return None

    column_letters = _column_name(first_col + column_position)
    first_data_row = header_row + 1

    if not parts.current_row:
        return normalize_reference(f"{table.sheet}!{column_letters}{first_data_row}:{column_letters}{last_row}")

    def via_target_table() -> WorkbookReference | None:
        # Fallback for formula cells that sit outside this table's data rows.
        return _resolve_cross_table_current_row(
            source_table=table,
            target=target,
            column_name=column_letters,
            tables=tables,
        )

    if target.sheet != table.sheet or target.start_cell is None:
        return via_target_table()
    try:
        _formula_col, formula_row, _formula_last_col, _formula_last_row = range_boundaries(target.start_cell)
    except ValueError:
        return None
    if formula_row < first_data_row or formula_row > last_row:
        return via_target_table()
    return normalize_reference(f"{table.sheet}!{column_letters}{formula_row}")
440
+
441
+
442
def _resolve_cross_table_current_row(
    *,
    source_table: TableRecord,
    target: WorkbookReference,
    column_name: str,
    tables: dict[str, TableRecord],
) -> WorkbookReference | None:
    """Map a current-row reference onto another table by relative row position.

    The formula cell must sit inside some table whose row span equals that of
    ``source_table``. The formula's offset below its own table's first row is
    then applied to ``source_table``. Returns ``None`` when no such mapping exists.
    """
    target_table = _table_containing_target(target, tables)
    if target_table is None or target.start_cell is None:
        return None

    try:
        _, formula_row, _, _ = range_boundaries(target.start_cell)
        _, source_top, _, source_bottom = range_boundaries(source_table.ref)
        _, target_top, _, target_bottom = range_boundaries(target_table.ref)
    except ValueError:
        return None

    # Only tables spanning the same number of rows are aligned row-for-row.
    if source_bottom - source_top != target_bottom - target_top:
        return None

    source_first_data_row = source_top + 1
    rows_below_first = formula_row - (target_top + 1)
    mapped_row = source_first_data_row + rows_below_first
    if mapped_row < source_first_data_row or mapped_row > source_bottom:
        return None
    return normalize_reference(f"{source_table.sheet}!{column_name}{mapped_row}")
470
+
471
+
472
@dataclass(frozen=True)
class _StructuredReferenceParts:
    """Parsed pieces of a structured table reference."""

    # Text before the first "[", or None when the reference starts with "[".
    table_name: str | None
    # Selected column name, or None when no column selector is present.
    column: str | None
    # True when the reference is restricted to the formula's own row.
    current_row: bool
    # True when the "#All" specifier is present.
    include_headers: bool
478
+
479
+
480
def _parse_structured_reference(reference: str) -> _StructuredReferenceParts | None:
    """Break a structured reference into table name, column and specifier flags.

    Returns ``None`` when the text contains no bracket pair at all. The column
    is taken from the last bracketed selector that is not a ``#`` specifier.
    """
    if not ("[" in reference and "]" in reference):
        return None

    prefix, _, _ = reference.partition("[")
    selectors = _bracketed_parts(reference)

    targets_current_row = any(
        selector == "#This Row" or selector.startswith("@") for selector in selectors
    )
    if reference.startswith("[@"):
        targets_current_row = True
    wants_headers = "#All" in selectors

    column: str | None = None
    for selector in reversed(selectors):
        if not selector.startswith("#"):
            column = _clean_structured_selector(selector)
            break

    return _StructuredReferenceParts(
        table_name=prefix or None,
        column=column,
        current_row=targets_current_row,
        include_headers=wants_headers,
    )
505
+
506
+
507
def _bracketed_parts(reference: str) -> tuple[str, ...]:
    """Extract the selectors enclosed in square brackets, in order.

    A group made up entirely of nested groups (``[[#This Row],[Amount]]``) is
    flattened into its inner selectors. A group with a prefix before a nested
    bracket (``[@[Unit Price]]``) is kept as one selector, brackets included.
    Text outside brackets and empty groups are ignored.
    """
    parts: list[str] = []
    buffer: list[str] = []
    depth = 0
    for character in reference:
        if character == "[":
            if depth > 0:
                buffer.append(character)
            depth += 1
        elif character == "]":
            if depth == 0:
                # Stray closing bracket: ignore it rather than letting the depth
                # go negative, which would hide every later selector.
                continue
            depth -= 1
            if depth > 0:
                buffer.append(character)
                continue
            part = "".join(buffer)
            buffer = []
            if part.startswith("[") and part.endswith("]"):
                parts.extend(_bracketed_parts(part))
            elif part:
                parts.append(part)
        elif depth > 0:
            buffer.append(character)
    return tuple(parts)
532
+
533
+
534
def _clean_structured_selector(selector: str) -> str:
    """Reduce a bracketed selector to the bare column name it refers to.

    Strips a leading ``@`` (current-row marker), removes one pair of enclosing
    brackets left over from the ``[@[Column Name]]`` form, and unescapes
    doubled apostrophes.
    """
    name = selector.removeprefix("@")
    if len(name) >= 2 and name[0] == "[" and name[-1] == "]":
        inner = name[1:-1]
        # Only unwrap a single bracket pair; "[A]:[B]" is a column range and is
        # left untouched.
        if "[" not in inner and "]" not in inner:
            name = inner
    return name.replace("''", "'")
536
+
537
+
538
def _table_containing_target(target: WorkbookReference, tables: dict[str, TableRecord]) -> TableRecord | None:
    """Return the first table on the target's sheet whose range covers the target cell.

    Tables whose range text cannot be parsed are skipped. Returns ``None`` when
    the target has no sheet or start cell, or when no table covers it.
    """
    if target.sheet is None or target.start_cell is None:
        return None

    try:
        column, row, _last_column, _last_row = range_boundaries(target.start_cell)
    except ValueError:
        return None

    same_sheet_tables = (table for table in tables.values() if table.sheet == target.sheet)
    for candidate in same_sheet_tables:
        try:
            left, top, right, bottom = range_boundaries(candidate.ref)
        except ValueError:
            continue
        if left <= column <= right and top <= row <= bottom:
            return candidate
    return None
557
+
558
+
559
def _expand_range_reference(source: WorkbookReference) -> tuple[WorkbookReference, ...]:
    """Expand a range reference into its individual cell references, row by row.

    Returns an empty tuple when the range has no sheet or endpoints, when its
    endpoints cannot be parsed, or when a boundary is open-ended.
    """
    if source.sheet is None or source.start_cell is None or source.end_cell is None:
        return ()

    try:
        min_col, min_row, max_col, max_row = range_boundaries(f"{source.start_cell}:{source.end_cell}")
    except ValueError:
        # Same convention as the other range_boundaries callers in this module:
        # an unparsable range is unresolved, not a fatal error.
        return ()
    if None in (min_col, min_row, max_col, max_row):
        # Defensive: whole-row / whole-column ranges have no finite cell list here.
        return ()

    return tuple(
        normalize_reference(f"{source.sheet}!{_column_name(column)}{row}")
        for row in range(min_row, max_row + 1)
        for column in range(min_col, max_col + 1)
    )
569
+
570
+
571
def _diagnostic_codes(edges: list[DependencyEdge]) -> tuple[str, ...]:
    """Collect the diagnostic codes present on the edges, in edge order, duplicates kept."""
    codes: list[str] = []
    for edge in edges:
        if edge.diagnostic_code is not None:
            codes.append(edge.diagnostic_code)
    return tuple(codes)
573
+
574
+
575
def _circular_dependency_codes(edges: list[DependencyEdge]) -> tuple[str, ...]:
    """Report ``"circular_dependency"`` when cell-to-cell execution edges form a cycle.

    Cycles of any length are detected, including a cell that depends on itself.
    Only execution edges whose source and target are both cells take part.

    Returns:
        ``("circular_dependency",)`` if a cycle exists, otherwise ``()``.
    """
    successors: dict[str, set[str]] = {}
    for edge in edges:
        if edge.edge_kind == "execution" and edge.source.kind == "cell" and edge.target.kind == "cell":
            successors.setdefault(edge.source.normalized, set()).add(edge.target.normalized)
            successors.setdefault(edge.target.normalized, set())

    incoming = dict.fromkeys(successors, 0)
    for downstream in successors.values():
        for node in downstream:
            incoming[node] += 1

    # Repeatedly remove cells nothing depends on; any cell that can never be
    # removed sits on, or downstream of, a cycle.
    ready = [node for node, count in incoming.items() if count == 0]
    removed = 0
    while ready:
        node = ready.pop()
        removed += 1
        for downstream_node in successors[node]:
            incoming[downstream_node] -= 1
            if incoming[downstream_node] == 0:
                ready.append(downstream_node)

    if removed < len(successors):
        return ("circular_dependency",)
    return ()
584
+
585
+
586
def _column_name(index: int) -> str:
    """Convert a 1-based column index into spreadsheet column letters (1 -> "A", 27 -> "AA").

    Non-positive indexes yield an empty string.
    """
    name = ""
    # "index > 0" rather than plain truthiness: a negative index never reaches
    # zero under this divmod step, so a truthiness test would loop forever.
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        name = chr(65 + remainder) + name
    return name
@@ -0,0 +1,59 @@
1
+ """Validation report helpers for oracle-backed comparisons."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Mapping
6
+
7
+ from modelwright.oracles import OracleResult
8
+ from modelwright.validation import (
9
+ Diagnostic,
10
+ JsonValue,
11
+ ValidationReport,
12
+ ValidationScenario,
13
+ build_validation_report,
14
+ )
15
+
16
+
17
def build_oracle_validation_report(
    *,
    scenario: ValidationScenario,
    generated_values: Mapping[str, JsonValue],
    oracle_result: OracleResult,
) -> ValidationReport:
    """Build a validation report comparing generated values with an oracle's outputs.

    The comparisons come from ``build_validation_report``. The diagnostics are
    the oracle's own diagnostics, plus an ``oracle_backend_mismatch`` error when
    the scenario names a different backend than the one that produced the result.
    """

    base_report = build_validation_report(
        scenario=scenario,
        generated_values=generated_values,
        oracle_values=oracle_result.outputs,
    )

    # NOTE(review): only scenario_id and comparisons are taken from the base
    # report; if build_validation_report also emits diagnostics they are not
    # forwarded here — confirm this is intended.
    diagnostics = [*_oracle_diagnostics(oracle_result)]

    if scenario.oracle.backend != oracle_result.backend:
        mismatch = Diagnostic(
            diagnostic_code="oracle_backend_mismatch",
            message="scenario oracle backend does not match oracle result backend",
            severity="error",
            location=scenario.scenario_id,
        )
        diagnostics.append(mismatch)

    return ValidationReport(
        scenario_id=base_report.scenario_id,
        oracle_backend=oracle_result.backend,
        comparisons=base_report.comparisons,
        diagnostics=tuple(diagnostics),
    )
48
+
49
+
50
def _oracle_diagnostics(oracle_result: OracleResult) -> tuple[Diagnostic, ...]:
    """Copy each oracle diagnostic into a ``Diagnostic`` record, preserving order."""
    copies: list[Diagnostic] = []
    for entry in oracle_result.diagnostics:
        copies.append(
            Diagnostic(
                diagnostic_code=entry.diagnostic_code,
                message=entry.message,
                severity=entry.severity,
                location=entry.location,
            )
        )
    return tuple(copies)