dataforge-07 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. dataforge/__init__.py +204 -0
  2. dataforge/__main__.py +5 -0
  3. dataforge/agent/__init__.py +16 -0
  4. dataforge/agent/providers.py +259 -0
  5. dataforge/agent/scratchpad.py +183 -0
  6. dataforge/agent/tool_actions.py +343 -0
  7. dataforge/bench/__init__.py +31 -0
  8. dataforge/bench/core.py +426 -0
  9. dataforge/bench/groq_client.py +386 -0
  10. dataforge/bench/methods.py +443 -0
  11. dataforge/bench/report.py +309 -0
  12. dataforge/bench/runner.py +247 -0
  13. dataforge/causal/__init__.py +21 -0
  14. dataforge/causal/dag.py +174 -0
  15. dataforge/causal/pc.py +232 -0
  16. dataforge/causal/root_cause.py +193 -0
  17. dataforge/cli/__init__.py +50 -0
  18. dataforge/cli/audit.py +70 -0
  19. dataforge/cli/bench.py +154 -0
  20. dataforge/cli/common.py +267 -0
  21. dataforge/cli/constraints.py +407 -0
  22. dataforge/cli/profile.py +147 -0
  23. dataforge/cli/release.py +166 -0
  24. dataforge/cli/repair.py +407 -0
  25. dataforge/cli/revert.py +139 -0
  26. dataforge/cli/watch.py +144 -0
  27. dataforge/datasets/__init__.py +25 -0
  28. dataforge/datasets/embedded/hospital/clean.csv +11 -0
  29. dataforge/datasets/embedded/hospital/dirty.csv +11 -0
  30. dataforge/datasets/real_world.py +290 -0
  31. dataforge/datasets/registry.py +103 -0
  32. dataforge/detectors/__init__.py +80 -0
  33. dataforge/detectors/base.py +145 -0
  34. dataforge/detectors/decimal_shift.py +166 -0
  35. dataforge/detectors/fd_violation.py +157 -0
  36. dataforge/detectors/type_mismatch.py +173 -0
  37. dataforge/engine/__init__.py +39 -0
  38. dataforge/engine/repair.py +905 -0
  39. dataforge/env/__init__.py +22 -0
  40. dataforge/env/environment.py +883 -0
  41. dataforge/env/observation.py +61 -0
  42. dataforge/env/openenv_core.py +161 -0
  43. dataforge/env/reward.py +128 -0
  44. dataforge/env/server.py +176 -0
  45. dataforge/evaluation_contract.py +76 -0
  46. dataforge/fixtures/hospital_10rows.csv +11 -0
  47. dataforge/fixtures/hospital_schema.yaml +17 -0
  48. dataforge/http/__init__.py +1 -0
  49. dataforge/http/problem.py +103 -0
  50. dataforge/integrations/__init__.py +1 -0
  51. dataforge/integrations/dbt.py +164 -0
  52. dataforge/observability.py +76 -0
  53. dataforge/py.typed +1 -0
  54. dataforge/release/__init__.py +1 -0
  55. dataforge/release/doctor.py +367 -0
  56. dataforge/release/full_vision.py +702 -0
  57. dataforge/release/gate.py +861 -0
  58. dataforge/release/playground_check.py +411 -0
  59. dataforge/repair_contract.py +468 -0
  60. dataforge/repairers/__init__.py +88 -0
  61. dataforge/repairers/base.py +77 -0
  62. dataforge/repairers/decimal_shift.py +43 -0
  63. dataforge/repairers/fd_violation.py +225 -0
  64. dataforge/repairers/type_mismatch.py +73 -0
  65. dataforge/safety/__init__.py +5 -0
  66. dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
  67. dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
  68. dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
  69. dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
  70. dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
  71. dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
  72. dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
  73. dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
  74. dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
  75. dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
  76. dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
  77. dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
  78. dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
  79. dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
  80. dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
  81. dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
  82. dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
  83. dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
  84. dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
  85. dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
  86. dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
  87. dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
  88. dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
  89. dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
  90. dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
  91. dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
  92. dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
  93. dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
  94. dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
  95. dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
  96. dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
  97. dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
  98. dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
  99. dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
  100. dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
  101. dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
  102. dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
  103. dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
  104. dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
  105. dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
  106. dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
  107. dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
  108. dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
  109. dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
  110. dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
  111. dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
  112. dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
  113. dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
  114. dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
  115. dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
  116. dataforge/safety/constitution.py +307 -0
  117. dataforge/safety/constitutions/default.yaml +40 -0
  118. dataforge/safety/filter.py +134 -0
  119. dataforge/schema_inference.py +620 -0
  120. dataforge/stores/__init__.py +46 -0
  121. dataforge/stores/base.py +73 -0
  122. dataforge/stores/cloud.py +78 -0
  123. dataforge/stores/csv.py +94 -0
  124. dataforge/stores/duckdb.py +313 -0
  125. dataforge/stores/patch_plan.py +178 -0
  126. dataforge/stores/registry.py +82 -0
  127. dataforge/stores/repair.py +121 -0
  128. dataforge/stores/revert.py +22 -0
  129. dataforge/stores/sql.py +27 -0
  130. dataforge/table.py +228 -0
  131. dataforge/transactions/__init__.py +34 -0
  132. dataforge/transactions/files.py +96 -0
  133. dataforge/transactions/log.py +613 -0
  134. dataforge/transactions/revert.py +102 -0
  135. dataforge/transactions/txn.py +104 -0
  136. dataforge/ui/__init__.py +1 -0
  137. dataforge/ui/profile_view.py +136 -0
  138. dataforge/ui/repair_diff.py +91 -0
  139. dataforge/verifier/__init__.py +55 -0
  140. dataforge/verifier/constraint_ir.py +155 -0
  141. dataforge/verifier/explain.py +47 -0
  142. dataforge/verifier/gate.py +5 -0
  143. dataforge/verifier/schema.py +111 -0
  144. dataforge/verifier/smt.py +433 -0
  145. dataforge_07-0.1.0.dist-info/METADATA +436 -0
  146. dataforge_07-0.1.0.dist-info/RECORD +150 -0
  147. dataforge_07-0.1.0.dist-info/WHEEL +5 -0
  148. dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
  149. dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
  150. dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,5 @@
1
+ """Compatibility exports for the Week 3 verifier."""
2
+
3
+ from dataforge.verifier.smt import SMTVerifier, VerificationResult, VerificationVerdict
4
+
5
+ __all__ = ["SMTVerifier", "VerificationResult", "VerificationVerdict"]
@@ -0,0 +1,111 @@
1
+ """Canonical schema models shared by detectors, safety, and the verifier."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from pydantic import ConfigDict, Field
8
+ from pydantic.dataclasses import dataclass
9
+
10
# Aggregate functions that an AggregateDependency may name.
AggregateLiteral = Literal["sum", "avg"]

# Shared pydantic configuration: every schema model in this module is frozen.
_CONFIG = ConfigDict(frozen=True)
13
+
14
+
15
@dataclass(config=_CONFIG, kw_only=True)
class FunctionalDependency:
    """Functional dependency of the form ``determinant columns -> dependent column``.

    Built with keyword arguments only and validated by pydantic using the
    module's shared frozen configuration.
    """

    # Left-hand side of the dependency; must contain at least one column name.
    determinant: tuple[str, ...] = Field(min_length=1)
    # Right-hand side of the dependency; must be a non-empty column name.
    dependent: str = Field(min_length=1)
21
+
22
+
23
@dataclass(config=_CONFIG, kw_only=True)
class DomainBound:
    """Optional numeric lower and/or upper limit declared for one column.

    Either limit may be omitted by leaving it as ``None``; both limits are
    inclusive unless the matching flag is switched off.
    """

    # Name of the bounded column; must be non-empty.
    column: str = Field(min_length=1)
    # Lower limit, or None when the column has no declared minimum.
    min_value: float | None = None
    # Upper limit, or None when the column has no declared maximum.
    max_value: float | None = None
    # True (default) when a value equal to min_value is allowed.
    inclusive_min: bool = True
    # True (default) when a value equal to max_value is allowed.
    inclusive_max: bool = True
32
+
33
+
34
@dataclass(config=_CONFIG, kw_only=True)
class AcceptedValues:
    """Closed list of values that a single column is allowed to hold."""

    # Name of the constrained column; must be non-empty.
    column: str = Field(min_length=1)
    # Allowed values, stored as strings; at least one is required.
    values: tuple[str, ...] = Field(min_length=1)
40
+
41
+
42
@dataclass(config=_CONFIG, kw_only=True)
class RegexConstraint:
    """Regular-expression pattern that values of a string column must match."""

    # Name of the constrained column; must be non-empty.
    column: str = Field(min_length=1)
    # Pattern source text; must be non-empty (not compiled or checked here).
    pattern: str = Field(min_length=1)
48
+
49
+
50
@dataclass(config=_CONFIG, kw_only=True)
class RelationshipConstraint:
    """Referential constraint linking one local column to a column of another relation."""

    # Local column that holds the reference; must be non-empty.
    column: str = Field(min_length=1)
    # Name of the referenced relation; must be non-empty.
    reference: str = Field(min_length=1)
    # Column of the referenced relation; must be non-empty.
    reference_column: str = Field(min_length=1)
57
+
58
+
59
@dataclass(config=_CONFIG, kw_only=True)
class AggregateDependency:
    """Metadata recording that a source column feeds an aggregate stored elsewhere."""

    # Column whose values are the aggregate's input; must be non-empty.
    source_column: str = Field(min_length=1)
    # Column that holds the aggregated result; must be non-empty.
    target_column: str = Field(min_length=1)
    # Aggregate function name; restricted to the AggregateLiteral choices.
    aggregate: AggregateLiteral
    # Grouping columns for the aggregate; empty by default.
    group_by: tuple[str, ...] = Field(default_factory=tuple)
67
+
68
+
69
@dataclass(config=_CONFIG, kw_only=True)
class Schema:
    """Optional declared schema for a dataset.

    Every field defaults to an empty container, so a bare ``Schema()`` declares
    nothing. The ``*_for`` helpers return, in declaration order, the rules that
    name a given column.
    """

    # Mapping of column name to its declared type name.
    columns: dict[str, str] = Field(default_factory=dict)
    functional_dependencies: tuple[FunctionalDependency, ...] = Field(default_factory=tuple)
    # Column-name sets, one per column-level property.
    pii_columns: frozenset[str] = Field(default_factory=frozenset)
    primary_key_columns: frozenset[str] = Field(default_factory=frozenset)
    not_null_columns: frozenset[str] = Field(default_factory=frozenset)
    unique_columns: frozenset[str] = Field(default_factory=frozenset)
    # Rule collections; each rule carries the name of the column it applies to.
    accepted_values: tuple[AcceptedValues, ...] = Field(default_factory=tuple)
    regex_constraints: tuple[RegexConstraint, ...] = Field(default_factory=tuple)
    relationships: tuple[RelationshipConstraint, ...] = Field(default_factory=tuple)
    domain_bounds: tuple[DomainBound, ...] = Field(default_factory=tuple)
    aggregate_dependencies: tuple[AggregateDependency, ...] = Field(default_factory=tuple)

    def column_type(self, column: str) -> str | None:
        """Look up the declared type of ``column``; ``None`` when it has none."""
        declared = self.columns.get(column)
        return declared

    def domain_bounds_for(self, column: str) -> tuple[DomainBound, ...]:
        """Collect every domain bound whose ``column`` equals the given name."""
        matching = [entry for entry in self.domain_bounds if entry.column == column]
        return tuple(matching)

    def accepted_values_for(self, column: str) -> tuple[AcceptedValues, ...]:
        """Collect every accepted-values rule whose ``column`` equals the given name."""
        matching = [entry for entry in self.accepted_values if entry.column == column]
        return tuple(matching)

    def regex_constraints_for(self, column: str) -> tuple[RegexConstraint, ...]:
        """Collect every regex constraint whose ``column`` equals the given name."""
        matching = [entry for entry in self.regex_constraints if entry.column == column]
        return tuple(matching)

    def relationships_for(self, column: str) -> tuple[RelationshipConstraint, ...]:
        """Collect every relationship constraint whose ``column`` equals the given name."""
        matching = [entry for entry in self.relationships if entry.column == column]
        return tuple(matching)

    def aggregate_dependencies_for(self, column: str) -> tuple[AggregateDependency, ...]:
        """Collect every aggregate dependency that uses ``column`` as its source input."""
        matching = [
            entry for entry in self.aggregate_dependencies if entry.source_column == column
        ]
        return tuple(matching)
@@ -0,0 +1,433 @@
1
+ """Z3-backed candidate verifier for Week 3 repairs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import enum
6
+ import re
7
+ from collections.abc import Callable
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ from pydantic import BaseModel, Field
12
+ from z3 import ( # type: ignore[import-untyped]
13
+ And,
14
+ Bool,
15
+ ForAll,
16
+ Function,
17
+ Implies,
18
+ Int,
19
+ IntSort,
20
+ IntVal,
21
+ Or,
22
+ RealSort,
23
+ RealVal,
24
+ Solver,
25
+ StringSort,
26
+ StringVal,
27
+ sat,
28
+ unknown,
29
+ unsat,
30
+ )
31
+
32
+ from dataforge.repairers.base import ProposedFix
33
+ from dataforge.table import (
34
+ TableLike,
35
+ cell_value,
36
+ column_names,
37
+ copy_table,
38
+ row_count,
39
+ set_cell_value,
40
+ )
41
+ from dataforge.verifier.explain import explain_unsat_core
42
+ from dataforge.verifier.schema import DomainBound, FunctionalDependency, Schema
43
+
44
# Callable that takes one argument and yields a Z3 term (used for per-column
# uninterpreted functions applied to a row index).
Z3ExprFactory = Callable[[Any], Any]
# Callable that turns a raw cell string into a Z3 literal.
Z3ValueFactory = Callable[[str], Any]
46
+
47
+
48
class VerificationVerdict(enum.Enum):
    """Closed set of outcomes the verifier gate can report for a candidate."""

    # The candidate passed the checks that were run.
    ACCEPT = "accept"
    # The candidate failed a check.
    REJECT = "reject"
    # No definite answer could be produced.
    UNKNOWN = "unknown"
54
+
55
+
56
class VerificationResult(BaseModel):
    """Immutable, typed outcome returned by the Week 3 verifier gate."""

    # Instances are frozen after construction.
    model_config = {"frozen": True}

    # Overall outcome of the verification.
    verdict: VerificationVerdict
    # Human-readable explanation; must be non-empty.
    reason: str = Field(min_length=1)
    # Labels of the constraints behind a rejection; empty by default.
    unsat_core: tuple[str, ...] = Field(default_factory=tuple)
64
+
65
+
66
@dataclass(frozen=True)
class _ColumnEncoding:
    """Immutable bundle of the Z3 helpers used to encode a single column."""

    # Column name as it appears in the table.
    name: str
    # Normalised schema type name that selected the encoding.
    column_type: str
    # Maps a Z3 row-index term to the Z3 term for that row's cell.
    function: Z3ExprFactory
    # Converts a raw cell value into a Z3 literal of the column's sort.
    value_factory: Z3ValueFactory
74
+
75
+
76
class SchemaToSMT:
    """Compile candidate-local constraints from a schema and working dataframe.

    An instance pairs a declared ``Schema`` with a table and checks one cell
    update at a time against the constraints that mention the updated column:
    not-null, primary key, uniqueness, accepted values, regex patterns, domain
    bounds, and functional dependencies. The table is only read by this class.
    """

    # Normalised (stripped, lower-cased) schema type names, grouped by the Z3
    # sort used to encode them.
    _INT_TYPES = frozenset({"int", "integer"})
    _REAL_TYPES = frozenset({"float", "decimal", "real"})
    _STRING_TYPES = frozenset({"str", "string"})

    def __init__(self, schema: Schema, df: TableLike, *, timeout_ms: int = 200) -> None:
        """Store the schema, the table to check against, and the solver timeout.

        Args:
            schema: Declared constraints for the table.
            df: Table holding the current cell values.
            timeout_ms: Value passed to the Z3 solver's ``timeout`` option.
        """
        self._schema = schema
        self._df = df
        self._timeout_ms = timeout_ms

    @staticmethod
    def _unknown(reason: str) -> VerificationResult:
        """Build an ``UNKNOWN`` result carrying the given explanation."""
        return VerificationResult(verdict=VerificationVerdict.UNKNOWN, reason=reason)

    def verify_fix(self, proposed_fix: ProposedFix) -> VerificationResult:
        """Return whether a candidate fix satisfies schema constraints.

        Args:
            proposed_fix: Candidate whose ``fix`` names the operation, row,
                column, and new value.

        Returns:
            ``REJECT`` for a non-update operation, an out-of-range row, an
            unknown column, a failed regex constraint, or an unsatisfiable
            constraint set (with the solver's unsat core attached);
            ``UNKNOWN`` when a type, value, or constraint cannot be encoded,
            a regex pattern is invalid, or the solver gives no definite
            answer; ``ACCEPT`` when the solver reports the set satisfiable.
        """
        fix = proposed_fix.fix
        if fix.operation != "update":
            return VerificationResult(
                verdict=VerificationVerdict.REJECT,
                reason="Only cell updates are supported by the verifier.",
            )

        row = fix.row
        column = fix.column
        if row < 0 or row >= row_count(self._df):
            return VerificationResult(
                verdict=VerificationVerdict.REJECT,
                reason=f"Row {row} is out of bounds for the input file.",
            )
        if column not in column_names(self._df):
            return VerificationResult(
                verdict=VerificationVerdict.REJECT,
                reason=f"Column '{column}' does not exist in the input file.",
            )

        # Only dependencies that mention the edited column can be affected;
        # every column they mention must be encoded alongside it.
        relevant_fds = tuple(
            fd
            for fd in self._schema.functional_dependencies
            if column == fd.dependent or column in fd.determinant
        )
        relevant_columns = {column}
        for fd in relevant_fds:
            relevant_columns.update(fd.determinant)
            relevant_columns.add(fd.dependent)

        solver = Solver()
        solver.set(timeout=self._timeout_ms, unsat_core=True)

        try:
            encodings = {
                name: self._build_column_encoding(name) for name in sorted(relevant_columns)
            }
            self._add_value_assignments(solver, encodings, proposed_fix)
        except ValueError as exc:
            return self._unknown(str(exc))

        target = encodings[column]
        if column in self._schema.not_null_columns:
            self._track_not_null(solver, target, proposed_fix)
        if column in self._schema.primary_key_columns:
            self._track_not_null(
                solver, target, proposed_fix, label_prefix="primary_key_not_null"
            )
            self._track_unique(solver, target, proposed_fix, label_prefix="primary_key_unique")
        if column in self._schema.unique_columns:
            self._track_unique(solver, target, proposed_fix)

        try:
            for rule in self._schema.accepted_values_for(column):
                self._track_accepted_values(solver, target, proposed_fix, rule.values)
        except ValueError as exc:
            return self._unknown(str(exc))

        # Regex rules are evaluated directly in Python rather than by the solver.
        regex_result = self._check_regex_constraints(column, proposed_fix)
        if regex_result is not None:
            return regex_result

        try:
            for bound in self._schema.domain_bounds_for(column):
                self._track_domain_bound(solver, target, proposed_fix, bound)
        except ValueError as exc:
            # A bound that cannot be encoded is reported, not raised.
            return self._unknown(str(exc))

        for fd in relevant_fds:
            self._track_fd_constraint(solver, encodings, proposed_fix, fd)

        result = solver.check()
        if result == sat:
            return VerificationResult(
                verdict=VerificationVerdict.ACCEPT,
                reason="The candidate fix satisfied all tracked verifier constraints.",
            )
        if result == unsat:
            unsat_core = tuple(str(label) for label in solver.unsat_core())
            return VerificationResult(
                verdict=VerificationVerdict.REJECT,
                reason=explain_unsat_core(unsat_core, self._schema),
                unsat_core=unsat_core,
            )
        if result == unknown:
            return self._unknown(f"Solver returned unknown: {solver.reason_unknown()}")
        return self._unknown("Solver returned an unrecognized status.")

    def _build_column_encoding(self, column: str) -> _ColumnEncoding:
        """Create the Z3 function and literal factory for one column.

        A column without a declared type is treated as ``"str"``.

        Raises:
            ValueError: If the declared type is not one of the supported
                integer, real, or string type names.
        """
        column_type = (self._schema.column_type(column) or "str").strip().lower()
        # Use the raw column name: rewriting spaces to underscores would give
        # columns such as "a b" and "a_b" the same uninterpreted function.
        function_name = f"col_{column}"
        if column_type in self._INT_TYPES:
            return _ColumnEncoding(
                name=column,
                column_type=column_type,
                function=Function(function_name, IntSort(), IntSort()),
                value_factory=lambda raw: IntVal(int(raw)),
            )
        if column_type in self._REAL_TYPES:
            return _ColumnEncoding(
                name=column,
                column_type=column_type,
                function=Function(function_name, IntSort(), RealSort()),
                value_factory=lambda raw: RealVal(str(float(raw))),
            )
        if column_type in self._STRING_TYPES:
            return _ColumnEncoding(
                name=column,
                column_type=column_type,
                function=Function(function_name, IntSort(), StringSort()),
                value_factory=lambda raw: StringVal(str(raw)),
            )
        raise ValueError(f"Unsupported schema type '{column_type}' for column '{column}'.")

    def _add_value_assignments(
        self,
        solver: Solver,
        encodings: dict[str, _ColumnEncoding],
        proposed_fix: ProposedFix,
    ) -> None:
        """Assert the value of every encoded cell, using the candidate value for the edited cell.

        Raises:
            ValueError: If a cell value cannot be converted to the column's type.
        """
        fix = proposed_fix.fix
        total_rows = row_count(self._df)
        for column, encoding in encodings.items():
            for index in range(total_rows):
                if index == fix.row and column == fix.column:
                    raw_value = fix.new_value
                else:
                    raw_value = cell_value(self._df, index, column)
                try:
                    z3_value = encoding.value_factory(raw_value)
                except (TypeError, ValueError) as exc:
                    raise ValueError(
                        f"Could not encode value '{raw_value}' for column '{column}' "
                        f"as type '{encoding.column_type}'."
                    ) from exc
                solver.add(encoding.function(IntVal(index)) == z3_value)

    def _bound_threshold(self, encoding: _ColumnEncoding, value: float) -> Any:
        """Return the Z3 numeral for a domain-bound limit.

        Integer-typed columns get an integer numeral only when the limit is a
        whole number; a fractional limit stays a real numeral so that it is
        not truncated (``min_value=0.5`` must not admit 0).
        """
        if encoding.column_type in self._INT_TYPES and float(value).is_integer():
            return IntVal(int(value))
        return RealVal(str(value))

    def _track_domain_bound(
        self,
        solver: Solver,
        encoding: _ColumnEncoding,
        proposed_fix: ProposedFix,
        bound: DomainBound,
    ) -> None:
        """Track the min/max limits of ``bound`` for the candidate cell.

        Raises:
            ValueError: If the column is string-encoded, for which a numeric
                comparison cannot be expressed.
        """
        if encoding.column_type in self._STRING_TYPES:
            raise ValueError(
                f"Domain bounds are not supported for column '{encoding.name}' "
                f"of type '{encoding.column_type}'."
            )
        row_expr = encoding.function(IntVal(proposed_fix.fix.row))
        if bound.min_value is not None:
            label = Bool(f"domain::{bound.column}::min::row::{proposed_fix.fix.row}")
            threshold = self._bound_threshold(encoding, bound.min_value)
            formula = row_expr >= threshold if bound.inclusive_min else row_expr > threshold
            solver.assert_and_track(formula, label)
        if bound.max_value is not None:
            label = Bool(f"domain::{bound.column}::max::row::{proposed_fix.fix.row}")
            threshold = self._bound_threshold(encoding, bound.max_value)
            formula = row_expr <= threshold if bound.inclusive_max else row_expr < threshold
            solver.assert_and_track(formula, label)

    def _track_not_null(
        self,
        solver: Solver,
        encoding: _ColumnEncoding,
        proposed_fix: ProposedFix,
        *,
        label_prefix: str = "not_null",
    ) -> None:
        """Track a non-empty value constraint for the candidate cell.

        Only string-encoded columns are constrained; other column types
        return without adding anything to the solver.
        """
        if encoding.column_type not in self._STRING_TYPES:
            return
        label = Bool(f"{label_prefix}::{encoding.name}::row::{proposed_fix.fix.row}")
        row_expr = encoding.function(IntVal(proposed_fix.fix.row))
        empty_value = encoding.value_factory("")
        solver.assert_and_track(row_expr != empty_value, label)

    def _track_unique(
        self,
        solver: Solver,
        encoding: _ColumnEncoding,
        proposed_fix: ProposedFix,
        *,
        label_prefix: str = "unique",
    ) -> None:
        """Track that the candidate value is unique across all other rows.

        Nothing is tracked when the table has no other row.
        """
        candidate_row = proposed_fix.fix.row
        candidate_expr = encoding.function(IntVal(candidate_row))
        other_rows = [
            encoding.function(IntVal(index)) != candidate_expr
            for index in range(row_count(self._df))
            if index != candidate_row
        ]
        if not other_rows:
            return
        label = Bool(f"{label_prefix}::{encoding.name}::row::{candidate_row}")
        solver.assert_and_track(And(*other_rows), label)

    def _track_accepted_values(
        self,
        solver: Solver,
        encoding: _ColumnEncoding,
        proposed_fix: ProposedFix,
        values: tuple[str, ...],
    ) -> None:
        """Track that the candidate value belongs to a closed allowed set.

        Raises:
            ValueError: If an allowed value cannot be converted to the column's type.
        """
        if not values:
            return
        row_expr = encoding.function(IntVal(proposed_fix.fix.row))
        try:
            allowed = [row_expr == encoding.value_factory(value) for value in values]
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Could not encode accepted values for column '{encoding.name}' "
                f"as type '{encoding.column_type}'."
            ) from exc
        label = Bool(f"accepted_values::{encoding.name}::row::{proposed_fix.fix.row}")
        solver.assert_and_track(Or(*allowed), label)

    def _check_regex_constraints(
        self,
        column: str,
        proposed_fix: ProposedFix,
    ) -> VerificationResult | None:
        """Conservatively evaluate declared regex constraints before solver check.

        Returns:
            ``None`` when ``column`` is not the edited column or every pattern
            fully matches the new value; a ``REJECT`` result for the first
            pattern that does not match; an ``UNKNOWN`` result when a pattern
            is not a valid regular expression.
        """
        if column != proposed_fix.fix.column:
            return None
        for rule in self._schema.regex_constraints_for(column):
            try:
                matches = re.fullmatch(rule.pattern, proposed_fix.fix.new_value) is not None
            except re.error as exc:
                return self._unknown(f"Invalid regex constraint for column '{column}': {exc}")
            if not matches:
                label = f"regex::{column}::row::{proposed_fix.fix.row}"
                return VerificationResult(
                    verdict=VerificationVerdict.REJECT,
                    reason=explain_unsat_core((label,), self._schema),
                    unsat_core=(label,),
                )
        return None

    def _track_fd_constraint(
        self,
        solver: Solver,
        encodings: dict[str, _ColumnEncoding],
        proposed_fix: ProposedFix,
        fd: FunctionalDependency,
    ) -> None:
        """Track that the candidate row agrees with every row sharing its determinant values."""
        # Use a universally-quantified implication over all valid other rows.
        other_row = Int("other_row")
        bounds_guard = And(other_row >= 0, other_row < row_count(self._df))
        candidate_row = IntVal(proposed_fix.fix.row)
        determinant_equal = And(
            *[
                encodings[column].function(candidate_row) == encodings[column].function(other_row)
                for column in fd.determinant
            ]
        )
        dependent = encodings[fd.dependent]
        dependent_equal = dependent.function(candidate_row) == dependent.function(other_row)
        determinant_label = "+".join(fd.determinant)
        label = Bool(f"fd::{determinant_label}::{fd.dependent}::row::{proposed_fix.fix.row}")
        solver.assert_and_track(
            ForAll([other_row], Implies(bounds_guard, Implies(determinant_equal, dependent_equal))),
            label,
        )
390
+
391
+
392
class SMTVerifier:
    """Compatibility wrapper over the Week 3 `SchemaToSMT` verifier."""

    def verify(
        self,
        df: TableLike,
        fixes: list[ProposedFix],
        schema: Schema | None = None,
    ) -> VerificationResult:
        """Check a sequence of candidate fixes against the working dataframe.

        With no schema, only row and column existence are checked. With a
        schema, each fix is verified in order against a copy of ``df`` that
        already contains the fixes accepted before it; the first result that
        is not ``ACCEPT`` is returned as-is.
        """
        if schema is None:
            return self._verify_structure(df, fixes)
        return self._verify_with_schema(df, fixes, schema)

    def _verify_structure(self, df: TableLike, fixes: list[ProposedFix]) -> VerificationResult:
        """Reject the first fix whose row or column is absent from ``df``."""
        total_rows = row_count(df)
        for candidate in fixes:
            target = candidate.fix
            if not 0 <= target.row < total_rows:
                return VerificationResult(
                    verdict=VerificationVerdict.REJECT,
                    reason=f"Row {target.row} is out of bounds for the input file.",
                )
            if target.column not in column_names(df):
                return VerificationResult(
                    verdict=VerificationVerdict.REJECT,
                    reason=f"Column '{target.column}' does not exist in the input file.",
                )
        return VerificationResult(
            verdict=VerificationVerdict.ACCEPT,
            reason="All proposed fixes passed structural verification.",
        )

    def _verify_with_schema(
        self,
        df: TableLike,
        fixes: list[ProposedFix],
        schema: Schema,
    ) -> VerificationResult:
        """Verify fixes one by one, applying each accepted fix to a private copy."""
        working_df = copy_table(df)
        for candidate in fixes:
            outcome = SchemaToSMT(schema, working_df).verify_fix(candidate)
            if outcome.verdict != VerificationVerdict.ACCEPT:
                return outcome
            # Later fixes are checked against the table with this one applied.
            target = candidate.fix
            set_cell_value(working_df, target.row, target.column, target.new_value)
        return VerificationResult(
            verdict=VerificationVerdict.ACCEPT,
            reason="All proposed fixes passed the SMT verifier.",
        )