dataforge-07 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge/__init__.py +204 -0
- dataforge/__main__.py +5 -0
- dataforge/agent/__init__.py +16 -0
- dataforge/agent/providers.py +259 -0
- dataforge/agent/scratchpad.py +183 -0
- dataforge/agent/tool_actions.py +343 -0
- dataforge/bench/__init__.py +31 -0
- dataforge/bench/core.py +426 -0
- dataforge/bench/groq_client.py +386 -0
- dataforge/bench/methods.py +443 -0
- dataforge/bench/report.py +309 -0
- dataforge/bench/runner.py +247 -0
- dataforge/causal/__init__.py +21 -0
- dataforge/causal/dag.py +174 -0
- dataforge/causal/pc.py +232 -0
- dataforge/causal/root_cause.py +193 -0
- dataforge/cli/__init__.py +50 -0
- dataforge/cli/audit.py +70 -0
- dataforge/cli/bench.py +154 -0
- dataforge/cli/common.py +267 -0
- dataforge/cli/constraints.py +407 -0
- dataforge/cli/profile.py +147 -0
- dataforge/cli/release.py +166 -0
- dataforge/cli/repair.py +407 -0
- dataforge/cli/revert.py +139 -0
- dataforge/cli/watch.py +144 -0
- dataforge/datasets/__init__.py +25 -0
- dataforge/datasets/embedded/hospital/clean.csv +11 -0
- dataforge/datasets/embedded/hospital/dirty.csv +11 -0
- dataforge/datasets/real_world.py +290 -0
- dataforge/datasets/registry.py +103 -0
- dataforge/detectors/__init__.py +80 -0
- dataforge/detectors/base.py +145 -0
- dataforge/detectors/decimal_shift.py +166 -0
- dataforge/detectors/fd_violation.py +157 -0
- dataforge/detectors/type_mismatch.py +173 -0
- dataforge/engine/__init__.py +39 -0
- dataforge/engine/repair.py +905 -0
- dataforge/env/__init__.py +22 -0
- dataforge/env/environment.py +883 -0
- dataforge/env/observation.py +61 -0
- dataforge/env/openenv_core.py +161 -0
- dataforge/env/reward.py +128 -0
- dataforge/env/server.py +176 -0
- dataforge/evaluation_contract.py +76 -0
- dataforge/fixtures/hospital_10rows.csv +11 -0
- dataforge/fixtures/hospital_schema.yaml +17 -0
- dataforge/http/__init__.py +1 -0
- dataforge/http/problem.py +103 -0
- dataforge/integrations/__init__.py +1 -0
- dataforge/integrations/dbt.py +164 -0
- dataforge/observability.py +76 -0
- dataforge/py.typed +1 -0
- dataforge/release/__init__.py +1 -0
- dataforge/release/doctor.py +367 -0
- dataforge/release/full_vision.py +702 -0
- dataforge/release/gate.py +861 -0
- dataforge/release/playground_check.py +411 -0
- dataforge/repair_contract.py +468 -0
- dataforge/repairers/__init__.py +88 -0
- dataforge/repairers/base.py +77 -0
- dataforge/repairers/decimal_shift.py +43 -0
- dataforge/repairers/fd_violation.py +225 -0
- dataforge/repairers/type_mismatch.py +73 -0
- dataforge/safety/__init__.py +5 -0
- dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
- dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
- dataforge/safety/constitution.py +307 -0
- dataforge/safety/constitutions/default.yaml +40 -0
- dataforge/safety/filter.py +134 -0
- dataforge/schema_inference.py +620 -0
- dataforge/stores/__init__.py +46 -0
- dataforge/stores/base.py +73 -0
- dataforge/stores/cloud.py +78 -0
- dataforge/stores/csv.py +94 -0
- dataforge/stores/duckdb.py +313 -0
- dataforge/stores/patch_plan.py +178 -0
- dataforge/stores/registry.py +82 -0
- dataforge/stores/repair.py +121 -0
- dataforge/stores/revert.py +22 -0
- dataforge/stores/sql.py +27 -0
- dataforge/table.py +228 -0
- dataforge/transactions/__init__.py +34 -0
- dataforge/transactions/files.py +96 -0
- dataforge/transactions/log.py +613 -0
- dataforge/transactions/revert.py +102 -0
- dataforge/transactions/txn.py +104 -0
- dataforge/ui/__init__.py +1 -0
- dataforge/ui/profile_view.py +136 -0
- dataforge/ui/repair_diff.py +91 -0
- dataforge/verifier/__init__.py +55 -0
- dataforge/verifier/constraint_ir.py +155 -0
- dataforge/verifier/explain.py +47 -0
- dataforge/verifier/gate.py +5 -0
- dataforge/verifier/schema.py +111 -0
- dataforge/verifier/smt.py +433 -0
- dataforge_07-0.1.0.dist-info/METADATA +436 -0
- dataforge_07-0.1.0.dist-info/RECORD +150 -0
- dataforge_07-0.1.0.dist-info/WHEEL +5 -0
- dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
- dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
- dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Backend-neutral patch plan models for verified tabular repairs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import secrets
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
12
|
+
|
|
13
|
+
from dataforge.transactions.txn import CellFix
|
|
14
|
+
|
|
15
|
+
PATCH_PLAN_SCHEMA_VERSION: Literal["patch_plan_v1"] = "patch_plan_v1"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RowIdentity(BaseModel):
    """Describe how a single row is located for a table-store repair.

    The model is immutable and rejects unknown fields. ``reason`` is the
    only field besides ``kind`` without a default and must be non-empty.
    """

    # Reject unexpected keys and freeze instances after construction.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Locator strategy; one of the three literal tags below.
    kind: Literal["csv_position", "column_values", "unavailable"]
    # Names of the identifying columns (empty by default).
    columns: tuple[str, ...] = Field(default_factory=tuple)
    # String values for the locator; presumably keyed by column name.
    values: dict[str, str] = Field(default_factory=dict)
    # Whether this locator is considered stable; defaults to False.
    stable: bool = False
    # Non-empty explanation accompanying the locator.
    reason: str = Field(min_length=1)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PatchOperation(BaseModel):
    """A single cell update inside a backend-neutral repair plan.

    Instances are immutable and reject unknown fields. The optional SQL
    fields carry backend statements associated with this one operation.
    """

    # Reject unexpected keys and freeze instances after construction.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Only "update" is representable in this schema version.
    operation: Literal["update"] = "update"
    relation: str = Field(min_length=1)
    # Non-negative row number taken from the originating fix.
    row: int = Field(ge=0)
    row_identity: RowIdentity
    column: str = Field(min_length=1)
    old_value: str
    new_value: str
    detector_id: str = Field(min_length=1)
    reason: str = Field(min_length=1)
    # Constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    provenance: str = Field(min_length=1)
    precondition_sql: str | None = None
    forward_sql: str | None = None
    rollback_sql: str | None = None
    verification_sql: tuple[str, ...] = Field(default_factory=tuple)

    @classmethod
    def from_cell_fix(
        cls,
        fix: CellFix,
        *,
        relation: str,
        row_identity: RowIdentity,
        reason: str,
        confidence: float,
        provenance: str,
        precondition_sql: str | None = None,
        forward_sql: str | None = None,
        rollback_sql: str | None = None,
        verification_sql: tuple[str, ...] = (),
    ) -> PatchOperation:
        """Translate an existing ``CellFix`` into a patch operation.

        Row, column, old/new values and detector id are copied from
        ``fix``; everything else comes from the keyword arguments.

        Raises:
            ValueError: If ``fix.operation`` is anything other than
                ``"update"``.
        """
        if fix.operation == "update":
            return cls(
                # Copied from the fix.
                row=fix.row,
                column=fix.column,
                old_value=fix.old_value,
                new_value=fix.new_value,
                detector_id=fix.detector_id,
                # Supplied by the caller.
                relation=relation,
                row_identity=row_identity,
                reason=reason,
                confidence=confidence,
                provenance=provenance,
                precondition_sql=precondition_sql,
                forward_sql=forward_sql,
                rollback_sql=rollback_sql,
                verification_sql=verification_sql,
            )
        raise ValueError("PatchPlan v1 supports cell updates only.")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class CostEstimate(BaseModel):
    """Backend-agnostic cost figures attached to a patch plan.

    All numeric fields are validated as non-negative. Instances are
    immutable and reject unknown fields.
    """

    # Reject unexpected keys and freeze instances after construction.
    model_config = ConfigDict(extra="forbid", frozen=True)

    rows_scanned: int = Field(ge=0)
    rows_written: int = Field(ge=0)
    # Optional; None when no byte figure is supplied.
    bytes_scanned: int | None = Field(default=None, ge=0)
    # Defaults to 0.0 when no quota figure is supplied.
    quota_units: float = Field(default=0.0, ge=0.0)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class PatchPlan(BaseModel):
    """Backend-neutral plan bundling cell updates with their SQL and flags.

    The original authors describe this as the only write contract accepted
    by non-CSV DataForge stores. Instances are immutable and reject
    unknown fields. Use :meth:`new` to build a plan with a fresh id,
    timestamp and derived SQL tuples.
    """

    # Reject unexpected keys and freeze instances after construction.
    model_config = ConfigDict(extra="forbid", frozen=True)

    schema_version: Literal["patch_plan_v1"] = PATCH_PLAN_SCHEMA_VERSION
    # "plan-" followed by exactly twelve lowercase hex digits.
    plan_id: str = Field(pattern=r"^plan-[0-9a-f]{12}$")
    created_at: datetime
    backend: str = Field(min_length=1)
    target: str = Field(min_length=1)
    relation: str = Field(min_length=1)
    row_identity_columns: tuple[str, ...] = Field(default_factory=tuple)
    stable_row_identity: bool
    operations: tuple[PatchOperation, ...] = Field(default_factory=tuple)
    forward_sql: tuple[str, ...] = Field(default_factory=tuple)
    rollback_sql: tuple[str, ...] = Field(default_factory=tuple)
    preflight_probes: tuple[str, ...] = Field(default_factory=tuple)
    verification_queries: tuple[str, ...] = Field(default_factory=tuple)
    touched_constraints: tuple[str, ...] = Field(default_factory=tuple)
    smt_obligations: tuple[str, ...] = Field(default_factory=tuple)
    cost_estimate: CostEstimate
    safety_verdict: str = Field(min_length=1)
    reversible: bool
    apply_supported: bool
    apply_requires_approval: bool = True
    audit_metadata: dict[str, str] = Field(default_factory=dict)
    reason: str = Field(min_length=1)

    @classmethod
    def new(
        cls,
        *,
        backend: str,
        target: str,
        relation: str,
        row_identity_columns: tuple[str, ...],
        operations: tuple[PatchOperation, ...],
        safety_verdict: str,
        rows_scanned: int,
        reason: str,
        touched_constraints: tuple[str, ...] = (),
        smt_obligations: tuple[str, ...] = (),
        audit_metadata: dict[str, str] | None = None,
        apply_supported: bool | None = None,
        reversible: bool | None = None,
    ) -> PatchPlan:
        """Build a plan, deriving the id, timestamp, SQL tuples and flags.

        Derivations:
            * ``stable_row_identity`` is true only when identity columns
              are given and every operation's row identity is stable.
            * ``apply_supported`` defaults to "stable and non-empty"
              unless passed explicitly.
            * ``reversible`` defaults to the resolved ``apply_supported``
              unless passed explicitly.
            * Plan-level SQL tuples collect the non-empty per-operation
              statements in operation order.
            * ``rows_written`` in the cost estimate is the number of
              operations.
        """
        identity_is_stable = bool(row_identity_columns) and all(
            step.row_identity.stable for step in operations
        )
        if apply_supported is None:
            can_apply = identity_is_stable and bool(operations)
        else:
            can_apply = apply_supported
        can_rollback = can_apply if reversible is None else reversible

        # Falsy statements (None or "") are skipped for all three tuples.
        forward_statements = tuple(
            step.forward_sql for step in operations if step.forward_sql
        )
        rollback_statements = tuple(
            step.rollback_sql for step in operations if step.rollback_sql
        )
        probes = tuple(
            step.precondition_sql for step in operations if step.precondition_sql
        )
        checks = tuple(
            statement for step in operations for statement in step.verification_sql
        )
        estimate = CostEstimate(rows_scanned=rows_scanned, rows_written=len(operations))

        return cls(
            plan_id=f"plan-{secrets.token_hex(6)}",
            created_at=datetime.now(UTC),
            backend=backend,
            target=target,
            relation=relation,
            row_identity_columns=row_identity_columns,
            stable_row_identity=identity_is_stable,
            operations=operations,
            forward_sql=forward_statements,
            rollback_sql=rollback_statements,
            preflight_probes=probes,
            verification_queries=checks,
            touched_constraints=touched_constraints,
            smt_obligations=smt_obligations,
            cost_estimate=estimate,
            safety_verdict=safety_verdict,
            reversible=can_rollback,
            apply_supported=can_apply,
            audit_metadata=audit_metadata or {},
            reason=reason,
        )

    def canonical_json(self) -> str:
        """Serialize the plan as compact JSON with sorted keys."""
        payload = self.model_dump(mode="json")
        return json.dumps(payload, sort_keys=True, separators=(",", ":"))

    def sha256(self) -> str:
        """Return the hex SHA-256 digest of :meth:`canonical_json` (UTF-8)."""
        encoded = self.canonical_json().encode("utf-8")
        return hashlib.sha256(encoded).hexdigest()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Table-store URI parsing and adapter selection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from urllib.parse import parse_qs, unquote, urlparse
|
|
8
|
+
|
|
9
|
+
from dataforge.stores.base import TableStore, TableStoreError
|
|
10
|
+
from dataforge.stores.cloud import CloudWarehouseStore
|
|
11
|
+
from dataforge.stores.duckdb import DuckDBStore
|
|
12
|
+
|
|
13
|
+
_CLOUD_BACKENDS = {"snowflake", "bigquery", "databricks", "databricks_delta"}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class TableStoreSpec:
    """Immutable record holding the pieces of a parsed table-store URI."""

    # Backend name extracted from the URI.
    backend: str
    # Target string identifying the store.
    target: str
    # Relation name for the table being addressed.
    relation: str
    # Filesystem path of a local database, or None when not supplied.
    database_path: Path | None
    # Column names used to identify rows (may be empty).
    row_identity_columns: tuple[str, ...]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def is_table_store_uri(raw: str) -> bool:
    """Tell whether ``raw`` begins with the ``warehouse://`` prefix.

    The comparison is case-sensitive and looks only at the prefix.
    """
    prefix = "warehouse://"
    return raw[: len(prefix)] == prefix
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def parse_table_store_uri(raw: str, *, row_ids: tuple[str, ...] = ()) -> TableStoreSpec:
    """Parse a ``warehouse://`` URI into an adapter spec.

    Supported local form:
    ``warehouse://duckdb?database=/tmp/dev.duckdb&relation=main.model&row_id=id``.

    Args:
        raw: The URI string. It is stored unchanged as ``target``.
        row_ids: Fallback row-identity columns, used only when the URI has
            no non-empty ``row_id`` (or ``key``) query parameter.

    Returns:
        A ``TableStoreSpec`` with a lower-cased backend name, the relation,
        an optional user-expanded database path (from ``database`` or
        ``path``) and the resolved row-identity columns.

    Raises:
        TableStoreError: If the scheme is not ``warehouse``, no backend
            name can be found, or ``relation`` is missing or empty.
    """
    parsed = urlparse(raw)
    if parsed.scheme != "warehouse":
        raise TableStoreError("Table-store URIs must use the warehouse:// scheme.")
    # The backend is the authority part, or the first path segment if the
    # authority is empty.
    backend = (parsed.netloc or parsed.path.strip("/").split("/", 1)[0]).lower()
    if not backend:
        raise TableStoreError("Warehouse URI must include a backend name.")
    # parse_qs already percent-decodes values (and maps "+" to a space), so
    # they must not be unquoted again: a second pass would corrupt values
    # that legitimately contain "%XX" after decoding. The last occurrence
    # of a repeated key wins.
    query = {key: values[-1] for key, values in parse_qs(parsed.query).items() if values}
    relation = query.get("relation", "")
    if not relation:
        raise TableStoreError("Warehouse URI must include relation=<schema.table>.")
    database = query.get("database") or query.get("path")
    row_id_query = query.get("row_id") or query.get("key")
    resolved_row_ids = row_ids
    if row_id_query:
        # Comma-separated list; blank entries are dropped.
        resolved_row_ids = tuple(part.strip() for part in row_id_query.split(",") if part.strip())
    return TableStoreSpec(
        backend=backend,
        target=raw,
        relation=relation,
        database_path=Path(database).expanduser() if database else None,
        row_identity_columns=resolved_row_ids,
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def store_from_uri(raw: str, *, row_ids: tuple[str, ...] = ()) -> TableStore:
    """Build the table-store adapter selected by a ``warehouse://`` URI.

    ``duckdb`` yields a ``DuckDBStore`` and needs a database path. Names in
    ``_CLOUD_BACKENDS`` yield a ``CloudWarehouseStore``, with
    ``databricks_delta`` passed on as ``databricks``.

    Raises:
        TableStoreError: If the URI fails to parse, a DuckDB URI has no
            database path, or the backend is not recognised.
    """
    spec = parse_table_store_uri(raw, row_ids=row_ids)

    if spec.backend == "duckdb":
        db_path = spec.database_path
        if db_path is None:
            raise TableStoreError("DuckDB warehouse URI requires database=<path>.")
        return DuckDBStore(
            database_path=db_path,
            relation=spec.relation,
            row_identity_columns=spec.row_identity_columns,
            target=spec.target,
        )

    if spec.backend not in _CLOUD_BACKENDS:
        raise TableStoreError(f"Unsupported warehouse backend: {spec.backend}")

    # "databricks_delta" is an accepted alias for "databricks".
    cloud_name = "databricks" if spec.backend == "databricks_delta" else spec.backend
    return CloudWarehouseStore(
        backend=cloud_name,
        target=spec.target,
        relation=spec.relation,
        row_identity_columns=spec.row_identity_columns,
    )
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Repair pipeline entrypoints for table-store targets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict
|
|
8
|
+
|
|
9
|
+
from dataforge.detectors import run_all_detectors
|
|
10
|
+
from dataforge.detectors.base import Issue, Schema
|
|
11
|
+
from dataforge.engine.repair import propose_repairs
|
|
12
|
+
from dataforge.safety import SafetyFilter, SafetyVerdict
|
|
13
|
+
from dataforge.stores.base import StoreApplyReceipt, TableStore, TableStoreError
|
|
14
|
+
from dataforge.stores.patch_plan import PatchPlan
|
|
15
|
+
from dataforge.table import copy_table
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TableStoreRepairResult(BaseModel):
    """Outcome of one table-store repair run.

    Validation is strict, arbitrary field types are allowed, unknown
    fields are rejected and instances are immutable.
    """

    model_config = ConfigDict(
        strict=True,
        arbitrary_types_allowed=True,
        extra="forbid",
        frozen=True,
    )

    schema_version: str = "table_store_repair_result_v1"
    # Mode string supplied by the caller of the repair run.
    mode: str
    target: str
    backend: str
    issues: list[Issue]
    # One plain dict per fix.
    fixes: list[dict[str, object]]
    patch_plan: PatchPlan
    # None by default, i.e. when no receipt is supplied.
    apply_receipt: StoreApplyReceipt | None = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def run_table_store_repair(
    store: TableStore,
    *,
    mode: str,
    schema: Schema | None,
    allow_llm: bool = False,
    model: str = "gemini-2.0-flash",
    allow_pii: bool = False,
    confirm_pii: bool = False,
    confirm_escalations: bool = False,
    state_root: Path | None = None,
    only_column: str | None = None,
) -> TableStoreRepairResult:
    """Run detection, repair proposal, planning and optional apply on a store.

    Flow:
        1. Validate ``mode`` (``"dry_run"`` or ``"apply"``).
        2. For snowflake/bigquery/databricks backends, build an empty
           plan with verdict ``"dry_run_only"``; ``apply`` raises with the
           plan's reason, ``dry_run`` returns an empty result.
        3. Otherwise read the table, run all detectors (optionally
           filtered to ``only_column``) and propose repairs
           non-interactively on a copy of the table.
        4. Evaluate the accepted fixes as a batch; any verdict other than
           ``ALLOW`` discards all fixes.
        5. Build the patch plan and, in ``apply`` mode, apply it.

    Raises:
        TableStoreError: For an invalid mode, or when ``apply`` is
            requested on one of the cloud backends listed above.
    """
    if mode not in {"dry_run", "apply"}:
        raise TableStoreError("Table-store repair mode must be dry_run or apply.")
    wants_apply = mode == "apply"

    if store.backend in {"snowflake", "bigquery", "databricks"}:
        # These backends only ever get an empty, dry-run-only plan here.
        placeholder_plan = store.build_patch_plan(
            [],
            schema=schema,
            safety_verdict="dry_run_only",
            touched_constraints=(),
            smt_obligations=(),
        )
        if wants_apply:
            raise TableStoreError(placeholder_plan.reason)
        return TableStoreRepairResult(
            mode=mode,
            target=store.target,
            backend=store.backend,
            issues=[],
            fixes=[],
            patch_plan=placeholder_plan,
        )

    table = store.read_table()
    issues = run_all_detectors(table, schema)
    if only_column is not None:
        issues = [found for found in issues if found.column == only_column]

    # propose_repairs takes a path argument; a fixed name under the current
    # working directory is passed for table-store targets.
    placeholder_path = Path.cwd() / ".dataforge" / "warehouse-target.csv"
    accepted_fixes, _attempt_groups = propose_repairs(
        issues,
        placeholder_path,
        copy_table(table),
        schema,
        allow_llm=allow_llm,
        model=model,
        allow_pii=allow_pii,
        confirm_pii=confirm_pii,
        confirm_escalations=confirm_escalations,
        interactive=False,
    )

    batch_safety = SafetyFilter().evaluate_batch(accepted_fixes)
    if batch_safety.verdict != SafetyVerdict.ALLOW:
        # A non-ALLOW verdict rejects the batch as a whole.
        accepted_fixes = []

    obligations = ("SMTVerifier.verify",) if accepted_fixes else ()
    plan = store.build_patch_plan(
        accepted_fixes,
        schema=schema,
        safety_verdict=batch_safety.verdict.value,
        touched_constraints=(),
        smt_obligations=obligations,
    )

    apply_receipt = store.apply_patch_plan(plan, state_root=state_root) if wants_apply else None

    fix_summaries = [
        {
            "row": accepted.fix.row,
            "column": accepted.fix.column,
            "old_value": accepted.fix.old_value,
            "new_value": accepted.fix.new_value,
            "detector_id": accepted.fix.detector_id,
            "reason": accepted.reason,
            "confidence": accepted.confidence,
            "provenance": accepted.provenance,
        }
        for accepted in accepted_fixes
    ]
    return TableStoreRepairResult(
        mode=mode,
        target=store.target,
        backend=store.backend,
        issues=issues,
        fixes=fix_summaries,
        patch_plan=plan,
        apply_receipt=apply_receipt,
    )
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Rollback helpers for table-store transactions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from dataforge.stores.base import TableStoreError
|
|
8
|
+
from dataforge.stores.duckdb import load_duckdb_transaction
|
|
9
|
+
from dataforge.transactions.log import load_transaction
|
|
10
|
+
from dataforge.transactions.txn import RepairTransaction
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def revert_table_store_transaction(log_path: Path) -> RepairTransaction:
    """Undo the table-store transaction recorded at ``log_path``.

    Only the ``duckdb`` backend is handled. After reverting, the log is
    loaded again and that freshly loaded transaction is returned.

    Raises:
        TableStoreError: If the recorded backend is not ``duckdb``.
    """
    transaction = load_transaction(log_path)
    if transaction.backend != "duckdb":
        raise TableStoreError(
            f"Table-store revert is not implemented for backend {transaction.backend!r}."
        )
    store, loaded = load_duckdb_transaction(log_path)
    store.revert_transaction(loaded, log_path=log_path)
    # Reload so the caller sees the log as it stands after the revert.
    return load_transaction(log_path)
|
dataforge/stores/sql.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""SQL rendering helpers for DataForge table-store adapters."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
# A bare identifier followed by any number of ".identifier" or
# '."quoted name"' segments; the first segment must be unquoted.
_SAFE_RELATION_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*|\.\"[^\"]+\")*$")
# A single bare identifier: letter or underscore, then letters/digits/underscores.
_SAFE_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


def ensure_safe_relation(relation: str) -> str:
    """Validate a relation name and hand it back unchanged.

    Raises:
        ValueError: If ``relation`` does not fully match the safe
            relation pattern.
    """
    if _SAFE_RELATION_RE.fullmatch(relation):
        return relation
    raise ValueError(f"Unsafe relation identifier: {relation}")


def quote_identifier(identifier: str) -> str:
    """Wrap a validated bare identifier in double quotes.

    Raises:
        ValueError: If ``identifier`` is not a plain
            letters/digits/underscore name starting with a non-digit.
    """
    if _SAFE_IDENTIFIER_RE.fullmatch(identifier) is None:
        raise ValueError(f"Unsafe column identifier: {identifier}")
    return '"' + identifier + '"'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def sql_literal(value: object) -> str:
    """Render ``value`` as a single-quoted SQL string literal.

    The value is converted with ``str()`` and every single quote is
    doubled. Note that ``None`` therefore renders as ``'None'``.
    """
    text = str(value)
    escaped = text.replace("'", "''")
    return f"'{escaped}'"
|
dataforge/table.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Small string-preserving table primitives for DataForge core paths.
|
|
2
|
+
|
|
3
|
+
The CLI hot path should not need pandas just to profile or repair a CSV.
|
|
4
|
+
This module provides the narrow DataFrame-like surface that DataForge's
|
|
5
|
+
detectors, repairers, and verifier actually need.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import csv
|
|
11
|
+
import io
|
|
12
|
+
from collections.abc import Iterable, Iterator, Sequence
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Protocol, cast, overload
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TableLike(Protocol):
    """Structural type for the table operations DataForge core code uses.

    Any object exposing these members satisfies the protocol.
    """

    @property
    def columns(self) -> Any:
        """Column labels of the table."""

    @property
    def index(self) -> Any:
        """Row index; helpers in this module take its ``len``."""

    @property
    def at(self) -> Any:
        """Cell accessor addressed as ``table.at[row, column]``."""

    def __getitem__(self, key: str) -> Any:
        """Return the values of the column named ``key``."""

    def copy(self, deep: bool = True) -> Any:
        """Return a copy of the table."""

    def to_csv(
        self,
        buffer: io.StringIO,
        *,
        index: bool = False,
        lineterminator: str = "\n",
    ) -> None:
        """Write the table as CSV text into ``buffer``."""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ColumnView(Sequence[str]):
    """Sequence wrapper around one column's values.

    Exposes the ``Sequence`` interface plus ``tolist()``; it offers no
    mutating methods of its own.
    """

    def __init__(self, values: Sequence[str]) -> None:
        # The sequence is held by reference; no copy is taken here.
        self._values = values

    def __len__(self) -> int:
        return len(self._values)

    def __iter__(self) -> Iterator[str]:
        return iter(self._values)

    @overload
    def __getitem__(self, index: int) -> str: ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[str]: ...

    def __getitem__(self, index: int | slice) -> str | Sequence[str]:
        # Delegates directly, so a slice yields whatever the wrapped
        # sequence produces for slices.
        return self._values[index]

    def tolist(self) -> list[str]:
        """Return the values as a new list (pandas-Series-style helper)."""
        return [*self._values]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class _AtIndexer:
    """Adapter that makes ``table.at[row, column]`` read and write cells.

    Reads go to ``table.cell`` and writes to ``table.set_cell``.
    """

    def __init__(self, table: Table) -> None:
        self._table = table

    def __getitem__(self, key: tuple[int, str]) -> str:
        row_position, column_name = key
        return self._table.cell(row_position, column_name)

    def __setitem__(self, key: tuple[int, str], value: object) -> None:
        row_position, column_name = key
        self._table.set_cell(row_position, column_name, value)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class Table:
    """In-memory table whose cells are always stored as strings.

    Column order is fixed at construction. Each row is normalised to
    contain exactly the declared columns: missing or ``None`` values become
    ``""`` and everything else is converted with ``str()``.
    """

    def __init__(self, columns: Sequence[str], rows: Iterable[dict[str, object]]) -> None:
        self._columns = [str(name) for name in columns]
        normalized: list[dict[str, str]] = []
        for source_row in rows:
            record: dict[str, str] = {}
            for name in self._columns:
                raw_value = source_row.get(name)
                record[name] = "" if raw_value is None else str(raw_value)
            normalized.append(record)
        self._rows: list[dict[str, str]] = normalized
        # Enables the ``table.at[row, column]`` access style.
        self.at = _AtIndexer(self)

    @property
    def columns(self) -> list[str]:
        """Column names, in declaration order, as a new list."""
        return self._columns.copy()

    @property
    def index(self) -> range:
        """Zero-based row positions as a ``range``."""
        return range(len(self._rows))

    @property
    def empty(self) -> bool:
        """``True`` when the table holds no rows."""
        return len(self._rows) == 0

    @overload
    def __getitem__(self, key: str) -> ColumnView: ...

    @overload
    def __getitem__(self, key: list[str]) -> Table: ...

    @overload
    def __getitem__(self, key: tuple[str, ...]) -> Table: ...

    def __getitem__(self, key: str | list[str] | tuple[str, ...]) -> ColumnView | Table:
        known = self._columns
        if not isinstance(key, str):
            # A list/tuple of names selects a sub-table in the given order.
            wanted = [str(name) for name in key]
            missing = next((name for name in wanted if name not in known), None)
            if missing is not None:
                raise KeyError(missing)
            return Table(
                wanted, ({name: row.get(name, "") for name in wanted} for row in self._rows)
            )
        if key in known:
            return ColumnView([row.get(key, "") for row in self._rows])
        raise KeyError(key)

    def __len__(self) -> int:
        return len(self._rows)

    def copy(self, deep: bool = True) -> Table:
        """Return a new table with copied row dictionaries.

        ``deep`` is accepted for signature compatibility and ignored.
        """
        del deep
        return Table(self._columns, (dict(existing) for existing in self._rows))

    def cell(self, row: int, column: str) -> str:
        """Return the value at ``row``/``column``.

        Raises ``KeyError`` for an unknown column; the row is indexed
        directly into the internal list.
        """
        if column in self._columns:
            return self._rows[row].get(column, "")
        raise KeyError(column)

    def set_cell(self, row: int, column: str, value: object) -> None:
        """Store ``value`` (stringified, ``None`` as ``""``) in a cell.

        Raises ``KeyError`` for an unknown column.
        """
        if column not in self._columns:
            raise KeyError(column)
        text = "" if value is None else str(value)
        self._rows[row][column] = text

    def iter_records(self, columns: Sequence[str] | None = None) -> Iterator[dict[str, str]]:
        """Yield one dict per row, limited to ``columns`` when given.

        Names not present in a row yield ``""`` rather than raising.
        """
        if columns is None:
            selected = self._columns
        else:
            selected = [str(name) for name in columns]
        for record in self._rows:
            yield {name: record.get(name, "") for name in selected}

    def to_dict(self, orient: str = "records") -> list[dict[str, str]]:
        """Return all rows as a list of dicts; only ``"records"`` is accepted."""
        if orient == "records":
            return list(self.iter_records())
        raise ValueError("Only orient='records' is supported.")

    def to_csv(
        self, buffer: io.StringIO, *, index: bool = False, lineterminator: str = "\n"
    ) -> None:
        """Write a header line and every row as CSV into ``buffer``.

        Raises ``ValueError`` when ``index`` is true.
        """
        if index:
            raise ValueError("Table.to_csv does not support index=True.")
        header = self._columns
        writer = csv.DictWriter(buffer, fieldnames=header, lineterminator=lineterminator)
        writer.writeheader()
        writer.writerows({name: record.get(name, "") for name in header} for record in self._rows)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def read_csv(path: Path) -> Table:
    """Load the CSV file at ``path`` into a ``Table`` of string cells.

    The file is opened as ``utf-8-sig``, so a leading byte-order mark is
    dropped. The first row supplies the column names; a file without a
    header produces a table with no columns.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        header = reader.fieldnames
        # Table is built inside the ``with`` block, while the file is open.
        return Table(list(header) if header else [], reader)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def table_to_csv_bytes(table: TableLike) -> bytes:
    """Serialize a table-like object to UTF-8 CSV bytes.

    The table's own ``to_csv`` is called with ``index=False`` and ``"\\n"``
    line endings on an in-memory text buffer, which is then encoded as
    UTF-8. ``Table`` and pandas-compatible objects accept the same call,
    so no type dispatch is needed.
    """
    output = io.StringIO()
    table.to_csv(output, index=False, lineterminator="\n")
    return output.getvalue().encode("utf-8")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def column_names(table: TableLike) -> list[str]:
    """List the table's column labels, each converted with ``str()``."""
    return list(map(str, table.columns))
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def row_count(table: TableLike) -> int:
    """Count rows by taking the length of the table's ``index``."""
    index = table.index
    return len(index)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def column_values(table: TableLike, column: str) -> list[Any]:
    """Return every value of ``column`` as a new list.

    If the column object offers ``tolist()`` that is used first; either
    way the result is passed through ``list()``.
    """
    values = table[column]
    materialized = values.tolist() if hasattr(values, "tolist") else values
    return list(materialized)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def cell_value(table: TableLike, row: int, column: str) -> str:
    """Read one cell through ``table.at`` and return it as ``str``."""
    accessor = table.at
    return str(accessor[row, column])
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def set_cell_value(table: TableLike, row: int, column: str, value: object) -> None:
    """Assign ``value`` to one cell through ``table.at``."""
    accessor = table.at
    accessor[row, column] = value
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def copy_table(table: TableLike) -> TableLike:
    """Copy a table-like object by calling its ``copy(deep=True)``.

    The ``cast`` only informs the type checker; the copy is returned as-is.
    """
    return cast(TableLike, table.copy(deep=True))
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Transaction exports for DataForge."""
|
|
2
|
+
|
|
3
|
+
from dataforge.transactions.log import (
|
|
4
|
+
LEGACY_SCHEMA_NAME,
|
|
5
|
+
SCHEMA_NAME,
|
|
6
|
+
TransactionAuditReport,
|
|
7
|
+
TransactionAuditVerdict,
|
|
8
|
+
append_applied_event,
|
|
9
|
+
append_created_transaction,
|
|
10
|
+
append_reverted_event,
|
|
11
|
+
find_transaction_log,
|
|
12
|
+
load_transaction,
|
|
13
|
+
verify_transaction_log,
|
|
14
|
+
)
|
|
15
|
+
from dataforge.transactions.revert import TransactionRevertError, revert_transaction
|
|
16
|
+
from dataforge.transactions.txn import CellFix, RepairTransaction, generate_txn_id
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"CellFix",
|
|
20
|
+
"LEGACY_SCHEMA_NAME",
|
|
21
|
+
"SCHEMA_NAME",
|
|
22
|
+
"TransactionAuditReport",
|
|
23
|
+
"TransactionAuditVerdict",
|
|
24
|
+
"RepairTransaction",
|
|
25
|
+
"TransactionRevertError",
|
|
26
|
+
"append_applied_event",
|
|
27
|
+
"append_created_transaction",
|
|
28
|
+
"append_reverted_event",
|
|
29
|
+
"find_transaction_log",
|
|
30
|
+
"generate_txn_id",
|
|
31
|
+
"load_transaction",
|
|
32
|
+
"revert_transaction",
|
|
33
|
+
"verify_transaction_log",
|
|
34
|
+
]
|