dataforge-07 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. dataforge/__init__.py +204 -0
  2. dataforge/__main__.py +5 -0
  3. dataforge/agent/__init__.py +16 -0
  4. dataforge/agent/providers.py +259 -0
  5. dataforge/agent/scratchpad.py +183 -0
  6. dataforge/agent/tool_actions.py +343 -0
  7. dataforge/bench/__init__.py +31 -0
  8. dataforge/bench/core.py +426 -0
  9. dataforge/bench/groq_client.py +386 -0
  10. dataforge/bench/methods.py +443 -0
  11. dataforge/bench/report.py +309 -0
  12. dataforge/bench/runner.py +247 -0
  13. dataforge/causal/__init__.py +21 -0
  14. dataforge/causal/dag.py +174 -0
  15. dataforge/causal/pc.py +232 -0
  16. dataforge/causal/root_cause.py +193 -0
  17. dataforge/cli/__init__.py +50 -0
  18. dataforge/cli/audit.py +70 -0
  19. dataforge/cli/bench.py +154 -0
  20. dataforge/cli/common.py +267 -0
  21. dataforge/cli/constraints.py +407 -0
  22. dataforge/cli/profile.py +147 -0
  23. dataforge/cli/release.py +166 -0
  24. dataforge/cli/repair.py +407 -0
  25. dataforge/cli/revert.py +139 -0
  26. dataforge/cli/watch.py +144 -0
  27. dataforge/datasets/__init__.py +25 -0
  28. dataforge/datasets/embedded/hospital/clean.csv +11 -0
  29. dataforge/datasets/embedded/hospital/dirty.csv +11 -0
  30. dataforge/datasets/real_world.py +290 -0
  31. dataforge/datasets/registry.py +103 -0
  32. dataforge/detectors/__init__.py +80 -0
  33. dataforge/detectors/base.py +145 -0
  34. dataforge/detectors/decimal_shift.py +166 -0
  35. dataforge/detectors/fd_violation.py +157 -0
  36. dataforge/detectors/type_mismatch.py +173 -0
  37. dataforge/engine/__init__.py +39 -0
  38. dataforge/engine/repair.py +905 -0
  39. dataforge/env/__init__.py +22 -0
  40. dataforge/env/environment.py +883 -0
  41. dataforge/env/observation.py +61 -0
  42. dataforge/env/openenv_core.py +161 -0
  43. dataforge/env/reward.py +128 -0
  44. dataforge/env/server.py +176 -0
  45. dataforge/evaluation_contract.py +76 -0
  46. dataforge/fixtures/hospital_10rows.csv +11 -0
  47. dataforge/fixtures/hospital_schema.yaml +17 -0
  48. dataforge/http/__init__.py +1 -0
  49. dataforge/http/problem.py +103 -0
  50. dataforge/integrations/__init__.py +1 -0
  51. dataforge/integrations/dbt.py +164 -0
  52. dataforge/observability.py +76 -0
  53. dataforge/py.typed +1 -0
  54. dataforge/release/__init__.py +1 -0
  55. dataforge/release/doctor.py +367 -0
  56. dataforge/release/full_vision.py +702 -0
  57. dataforge/release/gate.py +861 -0
  58. dataforge/release/playground_check.py +411 -0
  59. dataforge/repair_contract.py +468 -0
  60. dataforge/repairers/__init__.py +88 -0
  61. dataforge/repairers/base.py +77 -0
  62. dataforge/repairers/decimal_shift.py +43 -0
  63. dataforge/repairers/fd_violation.py +225 -0
  64. dataforge/repairers/type_mismatch.py +73 -0
  65. dataforge/safety/__init__.py +5 -0
  66. dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
  67. dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
  68. dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
  69. dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
  70. dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
  71. dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
  72. dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
  73. dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
  74. dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
  75. dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
  76. dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
  77. dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
  78. dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
  79. dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
  80. dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
  81. dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
  82. dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
  83. dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
  84. dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
  85. dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
  86. dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
  87. dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
  88. dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
  89. dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
  90. dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
  91. dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
  92. dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
  93. dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
  94. dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
  95. dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
  96. dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
  97. dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
  98. dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
  99. dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
  100. dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
  101. dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
  102. dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
  103. dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
  104. dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
  105. dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
  106. dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
  107. dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
  108. dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
  109. dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
  110. dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
  111. dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
  112. dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
  113. dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
  114. dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
  115. dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
  116. dataforge/safety/constitution.py +307 -0
  117. dataforge/safety/constitutions/default.yaml +40 -0
  118. dataforge/safety/filter.py +134 -0
  119. dataforge/schema_inference.py +620 -0
  120. dataforge/stores/__init__.py +46 -0
  121. dataforge/stores/base.py +73 -0
  122. dataforge/stores/cloud.py +78 -0
  123. dataforge/stores/csv.py +94 -0
  124. dataforge/stores/duckdb.py +313 -0
  125. dataforge/stores/patch_plan.py +178 -0
  126. dataforge/stores/registry.py +82 -0
  127. dataforge/stores/repair.py +121 -0
  128. dataforge/stores/revert.py +22 -0
  129. dataforge/stores/sql.py +27 -0
  130. dataforge/table.py +228 -0
  131. dataforge/transactions/__init__.py +34 -0
  132. dataforge/transactions/files.py +96 -0
  133. dataforge/transactions/log.py +613 -0
  134. dataforge/transactions/revert.py +102 -0
  135. dataforge/transactions/txn.py +104 -0
  136. dataforge/ui/__init__.py +1 -0
  137. dataforge/ui/profile_view.py +136 -0
  138. dataforge/ui/repair_diff.py +91 -0
  139. dataforge/verifier/__init__.py +55 -0
  140. dataforge/verifier/constraint_ir.py +155 -0
  141. dataforge/verifier/explain.py +47 -0
  142. dataforge/verifier/gate.py +5 -0
  143. dataforge/verifier/schema.py +111 -0
  144. dataforge/verifier/smt.py +433 -0
  145. dataforge_07-0.1.0.dist-info/METADATA +436 -0
  146. dataforge_07-0.1.0.dist-info/RECORD +150 -0
  147. dataforge_07-0.1.0.dist-info/WHEEL +5 -0
  148. dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
  149. dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
  150. dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,178 @@
1
+ """Backend-neutral patch plan models for verified tabular repairs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ import secrets
8
+ from datetime import UTC, datetime
9
+ from typing import Literal
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+ from dataforge.transactions.txn import CellFix
14
+
15
# Schema tag for serialized patch plans. PatchPlan.schema_version defaults to
# this value, and its Literal annotation allows no other value.
PATCH_PLAN_SCHEMA_VERSION: Literal["patch_plan_v1"] = "patch_plan_v1"
16
+
17
+
18
class RowIdentity(BaseModel):
    """Stable row locator required before a table-store repair can apply.

    Instances are immutable (``frozen=True``) and reject unknown fields
    (``extra="forbid"``).
    """

    # Locator strategy; exactly one of the three literals below is accepted.
    kind: Literal["csv_position", "column_values", "unavailable"]
    # Column names associated with the locator; empty tuple by default.
    columns: tuple[str, ...] = Field(default_factory=tuple)
    # String values keyed by name; empty by default.
    # NOTE(review): presumably keyed by the entries of ``columns`` -- confirm
    # against the store adapters.
    values: dict[str, str] = Field(default_factory=dict)
    # Whether the locator counts as stable. Defaults to False.
    stable: bool = False
    # Non-empty explanation string (min_length=1).
    reason: str = Field(min_length=1)

    model_config = ConfigDict(extra="forbid", frozen=True)
28
+
29
+
30
class PatchOperation(BaseModel):
    """A single cell-level update inside a backend-neutral repair plan.

    The model is frozen and forbids unknown fields. Only the ``"update"``
    operation exists in this schema version.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    operation: Literal["update"] = "update"
    relation: str = Field(min_length=1)
    row: int = Field(ge=0)
    row_identity: RowIdentity
    column: str = Field(min_length=1)
    old_value: str
    new_value: str
    detector_id: str = Field(min_length=1)
    reason: str = Field(min_length=1)
    confidence: float = Field(ge=0.0, le=1.0)
    provenance: str = Field(min_length=1)
    # Optional per-operation SQL snippets; PatchPlan.new gathers them into
    # the plan-level tuples.
    precondition_sql: str | None = None
    forward_sql: str | None = None
    rollback_sql: str | None = None
    verification_sql: tuple[str, ...] = Field(default_factory=tuple)

    @classmethod
    def from_cell_fix(
        cls,
        fix: CellFix,
        *,
        relation: str,
        row_identity: RowIdentity,
        reason: str,
        confidence: float,
        provenance: str,
        precondition_sql: str | None = None,
        forward_sql: str | None = None,
        rollback_sql: str | None = None,
        verification_sql: tuple[str, ...] = (),
    ) -> PatchOperation:
        """Turn an existing ``CellFix`` into a patch operation.

        The row, column, old/new values and detector id are copied from
        ``fix``; every other field comes from the keyword arguments.

        Raises:
            ValueError: If ``fix.operation`` is anything other than
                ``"update"``.
        """
        is_update = fix.operation == "update"
        if not is_update:
            raise ValueError("PatchPlan v1 supports cell updates only.")

        operation = cls(
            # Where the change lands.
            relation=relation,
            row=fix.row,
            row_identity=row_identity,
            column=fix.column,
            # What changes.
            old_value=fix.old_value,
            new_value=fix.new_value,
            # Why it changes.
            detector_id=fix.detector_id,
            reason=reason,
            confidence=confidence,
            provenance=provenance,
            # Optional SQL renderings supplied by the store adapter.
            precondition_sql=precondition_sql,
            forward_sql=forward_sql,
            rollback_sql=rollback_sql,
            verification_sql=verification_sql,
        )
        return operation
85
+
86
+
87
class CostEstimate(BaseModel):
    """Small, backend-agnostic estimate surfaced before mutation.

    Frozen model that forbids unknown fields; every numeric field is
    validated as non-negative.
    """

    # Row count supplied by the plan builder (>= 0).
    rows_scanned: int = Field(ge=0)
    # Rows written (>= 0); PatchPlan.new sets this to len(operations).
    rows_written: int = Field(ge=0)
    # Optional byte count (>= 0 when given); defaults to None.
    bytes_scanned: int | None = Field(default=None, ge=0)
    # Quota figure (>= 0.0), 0.0 by default.
    # NOTE(review): the unit is not defined in this module -- confirm per backend.
    quota_units: float = Field(default=0.0, ge=0.0)

    model_config = ConfigDict(extra="forbid", frozen=True)
96
+
97
+
98
class PatchPlan(BaseModel):
    """The only write contract accepted by non-CSV DataForge stores.

    A plan bundles the cell operations, the SQL derived from them, and the
    flags that say whether it can be applied and reverted. The model is
    frozen and forbids unknown fields.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    schema_version: Literal["patch_plan_v1"] = PATCH_PLAN_SCHEMA_VERSION
    # "plan-" followed by exactly twelve lowercase hex digits.
    plan_id: str = Field(pattern=r"^plan-[0-9a-f]{12}$")
    created_at: datetime
    backend: str = Field(min_length=1)
    target: str = Field(min_length=1)
    relation: str = Field(min_length=1)
    row_identity_columns: tuple[str, ...] = Field(default_factory=tuple)
    stable_row_identity: bool
    operations: tuple[PatchOperation, ...] = Field(default_factory=tuple)
    forward_sql: tuple[str, ...] = Field(default_factory=tuple)
    rollback_sql: tuple[str, ...] = Field(default_factory=tuple)
    preflight_probes: tuple[str, ...] = Field(default_factory=tuple)
    verification_queries: tuple[str, ...] = Field(default_factory=tuple)
    touched_constraints: tuple[str, ...] = Field(default_factory=tuple)
    smt_obligations: tuple[str, ...] = Field(default_factory=tuple)
    cost_estimate: CostEstimate
    safety_verdict: str = Field(min_length=1)
    reversible: bool
    apply_supported: bool
    apply_requires_approval: bool = True
    audit_metadata: dict[str, str] = Field(default_factory=dict)
    reason: str = Field(min_length=1)

    @classmethod
    def new(
        cls,
        *,
        backend: str,
        target: str,
        relation: str,
        row_identity_columns: tuple[str, ...],
        operations: tuple[PatchOperation, ...],
        safety_verdict: str,
        rows_scanned: int,
        reason: str,
        touched_constraints: tuple[str, ...] = (),
        smt_obligations: tuple[str, ...] = (),
        audit_metadata: dict[str, str] | None = None,
        apply_supported: bool | None = None,
        reversible: bool | None = None,
    ) -> PatchPlan:
        """Build a plan, deriving its id, timestamp, SQL tuples and flags.

        Derived values:
            * ``stable_row_identity`` is true only when identity columns were
              given and every operation's row identity is marked stable.
            * ``apply_supported`` defaults to "stable and at least one
              operation" unless the caller passes an explicit value.
            * ``reversible`` defaults to the resolved ``apply_supported``
              unless the caller passes an explicit value.
            * The plan-level SQL tuples collect the non-empty per-operation
              snippets in operation order.
        """
        if row_identity_columns:
            stable = all(op.row_identity.stable for op in operations)
        else:
            stable = False

        if apply_supported is None:
            supported = stable and bool(operations)
        else:
            supported = apply_supported

        if reversible is None:
            is_reversible = supported
        else:
            is_reversible = reversible

        # Falsy snippets (None or "") are skipped, as in the per-op fields.
        forward = tuple(op.forward_sql for op in operations if op.forward_sql)
        rollback = tuple(op.rollback_sql for op in operations if op.rollback_sql)
        probes = tuple(op.precondition_sql for op in operations if op.precondition_sql)
        verification: list[str] = []
        for op in operations:
            verification.extend(op.verification_sql)

        estimate = CostEstimate(rows_scanned=rows_scanned, rows_written=len(operations))

        return cls(
            plan_id=f"plan-{secrets.token_hex(6)}",
            created_at=datetime.now(UTC),
            backend=backend,
            target=target,
            relation=relation,
            row_identity_columns=row_identity_columns,
            stable_row_identity=stable,
            operations=operations,
            forward_sql=forward,
            rollback_sql=rollback,
            preflight_probes=probes,
            verification_queries=tuple(verification),
            touched_constraints=touched_constraints,
            smt_obligations=smt_obligations,
            cost_estimate=estimate,
            safety_verdict=safety_verdict,
            reversible=is_reversible,
            apply_supported=supported,
            audit_metadata=audit_metadata or {},
            reason=reason,
        )

    def canonical_json(self) -> str:
        """Serialize the plan as compact JSON with sorted keys.

        Sorted keys and fixed separators make the text deterministic, which
        is what ``sha256`` hashes.
        """
        payload = self.model_dump(mode="json")
        return json.dumps(payload, sort_keys=True, separators=(",", ":"))

    def sha256(self) -> str:
        """Return the hex SHA-256 digest of ``canonical_json()`` (UTF-8)."""
        encoded = self.canonical_json().encode("utf-8")
        return hashlib.sha256(encoded).hexdigest()
@@ -0,0 +1,82 @@
1
+ """Table-store URI parsing and adapter selection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from urllib.parse import parse_qs, unquote, urlparse
8
+
9
+ from dataforge.stores.base import TableStore, TableStoreError
10
+ from dataforge.stores.cloud import CloudWarehouseStore
11
+ from dataforge.stores.duckdb import DuckDBStore
12
+
13
# Backend names that store_from_uri routes to CloudWarehouseStore.
# "databricks_delta" is accepted here and mapped to "databricks" there.
_CLOUD_BACKENDS = {"snowflake", "bigquery", "databricks", "databricks_delta"}
14
+
15
+
16
@dataclass(frozen=True)
class TableStoreSpec:
    """Parsed table-store URI (immutable)."""

    # Lower-cased backend name taken from the URI, e.g. "duckdb".
    backend: str
    # The raw URI string exactly as supplied to the parser.
    target: str
    # Value of the ``relation`` query parameter.
    relation: str
    # User-expanded path from the ``database`` (or ``path``) query parameter;
    # None when neither is present.
    database_path: Path | None
    # Row-identity column names from ``row_id`` (or ``key``) when present,
    # otherwise the defaults passed by the caller.
    row_identity_columns: tuple[str, ...]
25
+
26
+
27
def is_table_store_uri(raw: str) -> bool:
    """Return whether a CLI target string names a DataForge table store.

    The check is a prefix test for ``warehouse://``. URI schemes are
    case-insensitive and ``urlparse`` lower-cases them, so the comparison
    ignores case; otherwise a URI such as ``WAREHOUSE://duckdb?...`` would
    parse successfully yet not be recognised as a table-store target.

    Args:
        raw: Target string as typed on the command line.

    Returns:
        True when ``raw`` starts with the ``warehouse://`` scheme prefix.
    """
    prefix = "warehouse://"
    # Lower-case only the slice being compared; the rest of the URI
    # (paths, relation names) stays untouched and is never inspected here.
    return raw[: len(prefix)].lower() == prefix
30
+
31
+
32
def parse_table_store_uri(raw: str, *, row_ids: tuple[str, ...] = ()) -> TableStoreSpec:
    """Parse a ``warehouse://`` URI into an adapter spec.

    Supported local form:
    ``warehouse://duckdb?database=/tmp/dev.duckdb&relation=main.model&row_id=id``.

    Args:
        raw: The URI to parse. It is stored unchanged as ``target``.
        row_ids: Row-identity columns to use when the URI carries no
            ``row_id``/``key`` query parameter.

    Returns:
        A ``TableStoreSpec`` with the lower-cased backend, the relation, the
        optional expanded database path and the resolved row-identity columns.

    Raises:
        TableStoreError: If the scheme is not ``warehouse``, the backend name
            is missing, or no ``relation`` parameter is present.
    """
    parsed = urlparse(raw)
    if parsed.scheme != "warehouse":
        raise TableStoreError("Table-store URIs must use the warehouse:// scheme.")

    # The backend is normally the authority part (warehouse://duckdb?...);
    # fall back to the first path segment when there is no authority.
    backend = (parsed.netloc or parsed.path.strip("/").split("/", 1)[0]).lower()
    if not backend:
        raise TableStoreError("Warehouse URI must include a backend name.")

    # parse_qs already percent-decodes every value, so the values must not be
    # unquoted a second time: doing so would turn a literal "%" that was
    # sent as "%25" into the start of a new escape sequence.
    # When a parameter is repeated, the last occurrence wins.
    query = {key: values[-1] for key, values in parse_qs(parsed.query).items() if values}

    relation = query.get("relation", "")
    if not relation:
        raise TableStoreError("Warehouse URI must include relation=<schema.table>.")

    database = query.get("database") or query.get("path")
    row_id_query = query.get("row_id") or query.get("key")

    # A row_id/key parameter in the URI overrides the caller-supplied default.
    resolved_row_ids = row_ids
    if row_id_query:
        resolved_row_ids = tuple(part.strip() for part in row_id_query.split(",") if part.strip())

    return TableStoreSpec(
        backend=backend,
        target=raw,
        relation=relation,
        database_path=Path(database).expanduser() if database else None,
        row_identity_columns=resolved_row_ids,
    )
60
+
61
+
62
def store_from_uri(raw: str, *, row_ids: tuple[str, ...] = ()) -> TableStore:
    """Build the table-store adapter that matches a CLI URI.

    ``duckdb`` yields a ``DuckDBStore`` and requires a database path. The
    cloud backends yield a ``CloudWarehouseStore``, with ``databricks_delta``
    reported as ``databricks``.

    Raises:
        TableStoreError: For an unknown backend, or a DuckDB URI without
            ``database=<path>`` (plus anything the URI parser raises).
    """
    spec = parse_table_store_uri(raw, row_ids=row_ids)
    name = spec.backend

    if name != "duckdb" and name not in _CLOUD_BACKENDS:
        raise TableStoreError(f"Unsupported warehouse backend: {spec.backend}")

    if name == "duckdb":
        db_path = spec.database_path
        if db_path is None:
            raise TableStoreError("DuckDB warehouse URI requires database=<path>.")
        return DuckDBStore(
            database_path=db_path,
            relation=spec.relation,
            row_identity_columns=spec.row_identity_columns,
            target=spec.target,
        )

    # Remaining case: one of the cloud backends.
    cloud_name = name
    if name == "databricks_delta":
        cloud_name = "databricks"
    return CloudWarehouseStore(
        backend=cloud_name,
        target=spec.target,
        relation=spec.relation,
        row_identity_columns=spec.row_identity_columns,
    )
@@ -0,0 +1,121 @@
1
+ """Repair pipeline entrypoints for table-store targets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from pydantic import BaseModel, ConfigDict
8
+
9
+ from dataforge.detectors import run_all_detectors
10
+ from dataforge.detectors.base import Issue, Schema
11
+ from dataforge.engine.repair import propose_repairs
12
+ from dataforge.safety import SafetyFilter, SafetyVerdict
13
+ from dataforge.stores.base import StoreApplyReceipt, TableStore, TableStoreError
14
+ from dataforge.stores.patch_plan import PatchPlan
15
+ from dataforge.table import copy_table
16
+
17
+
18
class TableStoreRepairResult(BaseModel):
    """Repair result for warehouse/table-store CLI calls.

    The model is frozen, validated in strict mode, forbids unknown fields and
    permits arbitrary (non-pydantic) field types.
    """

    # Fixed tag identifying this result layout.
    schema_version: str = "table_store_repair_result_v1"
    # The mode the repair ran in; run_table_store_repair passes "dry_run" or "apply".
    mode: str
    # ``store.target`` of the store that was repaired.
    target: str
    # ``store.backend`` of the store that was repaired.
    backend: str
    # Detected issues (after any single-column filter).
    issues: list[Issue]
    # One plain dict per accepted fix: row, column, old/new value, detector id,
    # reason, confidence and provenance.
    fixes: list[dict[str, object]]
    # The plan built from the accepted fixes.
    patch_plan: PatchPlan
    # Receipt returned by the store when the plan was applied; None otherwise.
    apply_receipt: StoreApplyReceipt | None = None

    model_config = ConfigDict(
        strict=True, arbitrary_types_allowed=True, extra="forbid", frozen=True
    )
33
+
34
+
35
def run_table_store_repair(
    store: TableStore,
    *,
    mode: str,
    schema: Schema | None,
    allow_llm: bool = False,
    model: str = "gemini-2.0-flash",
    allow_pii: bool = False,
    confirm_pii: bool = False,
    confirm_escalations: bool = False,
    state_root: Path | None = None,
    only_column: str | None = None,
) -> TableStoreRepairResult:
    """Detect issues, propose fixes, build a patch plan and optionally apply it.

    Flow:
        1. Snowflake, BigQuery and Databricks stores never read data here.
           They get an empty plan with verdict ``dry_run_only``; in ``apply``
           mode the plan's ``reason`` is raised as an error instead.
        2. Otherwise the table is read, detectors run (optionally narrowed to
           ``only_column``) and repairs are proposed non-interactively.
        3. The accepted fixes go through a batch safety check; any verdict
           other than ALLOW discards all of them.
        4. A patch plan is built and, in ``apply`` mode, handed to the store.

    Raises:
        TableStoreError: If ``mode`` is neither ``dry_run`` nor ``apply``, or
            if ``apply`` is requested for one of the cloud backends above.
    """
    if mode not in ("dry_run", "apply"):
        raise TableStoreError("Table-store repair mode must be dry_run or apply.")

    wants_apply = mode == "apply"

    if store.backend in {"snowflake", "bigquery", "databricks"}:
        empty_plan = store.build_patch_plan(
            [],
            schema=schema,
            safety_verdict="dry_run_only",
            touched_constraints=(),
            smt_obligations=(),
        )
        if wants_apply:
            raise TableStoreError(empty_plan.reason)
        return TableStoreRepairResult(
            mode=mode,
            target=store.target,
            backend=store.backend,
            issues=[],
            fixes=[],
            patch_plan=empty_plan,
        )

    table = store.read_table()
    issues = run_all_detectors(table, schema)
    if only_column is not None:
        issues = [found for found in issues if found.column == only_column]

    # NOTE(review): propose_repairs takes a path as its second argument; for
    # table stores this looks like a nominal placeholder -- confirm.
    nominal_path = Path.cwd() / ".dataforge" / "warehouse-target.csv"
    # The second return value (attempt groups) is not used by this entrypoint.
    accepted_fixes, _attempt_groups = propose_repairs(
        issues,
        nominal_path,
        copy_table(table),
        schema,
        allow_llm=allow_llm,
        model=model,
        allow_pii=allow_pii,
        confirm_pii=confirm_pii,
        confirm_escalations=confirm_escalations,
        interactive=False,
    )

    batch_safety = SafetyFilter().evaluate_batch(accepted_fixes)
    if batch_safety.verdict != SafetyVerdict.ALLOW:
        # The batch is all-or-nothing: a non-ALLOW verdict drops every fix.
        accepted_fixes = []

    obligations: tuple[str, ...] = ()
    if accepted_fixes:
        obligations = ("SMTVerifier.verify",)
    plan = store.build_patch_plan(
        accepted_fixes,
        schema=schema,
        safety_verdict=batch_safety.verdict.value,
        touched_constraints=(),
        smt_obligations=obligations,
    )

    apply_receipt = None
    if wants_apply:
        apply_receipt = store.apply_patch_plan(plan, state_root=state_root)

    fix_records: list[dict[str, object]] = []
    for accepted in accepted_fixes:
        cell = accepted.fix
        fix_records.append(
            {
                "row": cell.row,
                "column": cell.column,
                "old_value": cell.old_value,
                "new_value": cell.new_value,
                "detector_id": cell.detector_id,
                "reason": accepted.reason,
                "confidence": accepted.confidence,
                "provenance": accepted.provenance,
            }
        )

    return TableStoreRepairResult(
        mode=mode,
        target=store.target,
        backend=store.backend,
        issues=issues,
        fixes=fix_records,
        patch_plan=plan,
        apply_receipt=apply_receipt,
    )
@@ -0,0 +1,22 @@
1
+ """Rollback helpers for table-store transactions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from dataforge.stores.base import TableStoreError
8
+ from dataforge.stores.duckdb import load_duckdb_transaction
9
+ from dataforge.transactions.log import load_transaction
10
+ from dataforge.transactions.txn import RepairTransaction
11
+
12
+
13
def revert_table_store_transaction(log_path: Path) -> RepairTransaction:
    """Undo a table-store transaction recorded at ``log_path``.

    Only the ``duckdb`` backend is handled: the store and transaction are
    loaded from the log, the store reverts it, and the transaction is then
    re-read from the log and returned.

    Raises:
        TableStoreError: If the logged transaction belongs to any other backend.
    """
    recorded = load_transaction(log_path)
    backend = recorded.backend
    if backend != "duckdb":
        raise TableStoreError(
            f"Table-store revert is not implemented for backend {backend!r}."
        )

    store, loaded = load_duckdb_transaction(log_path)
    store.revert_transaction(loaded, log_path=log_path)
    # Re-read the log so the returned object is the post-revert record.
    return load_transaction(log_path)
@@ -0,0 +1,27 @@
1
+ """SQL rendering helpers for DataForge table-store adapters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
# Relation names: a bare identifier, followed by zero or more dot-separated
# parts that are each either a bare identifier or a double-quoted name with
# no embedded double quote. The first part can never be quoted.
_SAFE_RELATION_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*|\.\"[^\"]+\")*$")
# Column names: a single bare identifier (ASCII letter or underscore first,
# then ASCII letters, digits or underscores).
_SAFE_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
9
+
10
+
11
def ensure_safe_relation(relation: str) -> str:
    """Validate a relation name and hand it back unchanged.

    Raises:
        ValueError: If the whole string does not match the safe-relation
            pattern.
    """
    if _SAFE_RELATION_RE.fullmatch(relation) is not None:
        return relation
    raise ValueError(f"Unsafe relation identifier: {relation}")
16
+
17
+
18
def quote_identifier(identifier: str) -> str:
    """Wrap a validated column identifier in double quotes.

    Raises:
        ValueError: If the whole string does not match the safe-identifier
            pattern.
    """
    if _SAFE_IDENTIFIER_RE.fullmatch(identifier) is None:
        raise ValueError(f"Unsafe column identifier: {identifier}")
    # Validation guarantees there is no quote character to escape.
    return '"' + identifier + '"'
23
+
24
+
25
def sql_literal(value: object) -> str:
    """Render ``value`` as a single-quoted SQL string literal.

    The value is converted with ``str()`` and every single quote is doubled.
    No other character is escaped, and ``None`` renders as ``'None'``.
    """
    escaped = str(value).replace("'", "''")
    return f"'{escaped}'"
dataforge/table.py ADDED
@@ -0,0 +1,228 @@
1
+ """Small string-preserving table primitives for DataForge core paths.
2
+
3
+ The CLI hot path should not need pandas just to profile or repair a CSV.
4
+ This module provides the narrow DataFrame-like surface that DataForge's
5
+ detectors, repairers, and verifier actually need.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import csv
11
+ import io
12
+ from collections.abc import Iterable, Iterator, Sequence
13
+ from pathlib import Path
14
+ from typing import Any, Protocol, cast, overload
15
+
16
+
17
class TableLike(Protocol):
    """Protocol for the tabular surface consumed by DataForge core logic.

    Any object exposing these members can be passed to the module-level
    helpers in this file (``column_names``, ``row_count``, ``cell_value``...).
    """

    # Iterable of column labels.
    @property
    def columns(self) -> Any: ...

    # Sized collection of row positions; its length is the row count.
    @property
    def index(self) -> Any: ...

    # Indexer supporting ``table.at[row, column]`` reads and writes.
    @property
    def at(self) -> Any: ...

    # Column access by name.
    def __getitem__(self, key: str) -> Any: ...

    # Copy of the table; callers in this module pass ``deep=True``.
    def copy(self, deep: bool = True) -> Any: ...

    # CSV serialization into a text buffer.
    def to_csv(
        self,
        buffer: io.StringIO,
        *,
        index: bool = False,
        lineterminator: str = "\n",
    ) -> None: ...
40
+
41
+
42
class ColumnView(Sequence[str]):
    """Sequence wrapper over one column's values.

    The wrapper stores the sequence it is given (no copy) and exposes the
    ``Sequence`` interface plus a pandas-style ``tolist``.
    """

    def __init__(self, values: Sequence[str]) -> None:
        self._values = values

    def __len__(self) -> int:
        return len(self._values)

    def __iter__(self) -> Iterator[str]:
        return iter(self._values)

    @overload
    def __getitem__(self, index: int) -> str: ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[str]: ...

    def __getitem__(self, index: int | slice) -> str | Sequence[str]:
        # Integers and slices are both delegated to the wrapped sequence.
        return self._values[index]

    def tolist(self) -> list[str]:
        """Return the values as a new list (mirrors ``Series.tolist``)."""
        return [*self._values]
66
+
67
+
68
class _AtIndexer:
    """Adapter that makes ``table.at[row, column]`` read and write cells."""

    def __init__(self, table: Table) -> None:
        self._table = table

    def __getitem__(self, key: tuple[int, str]) -> str:
        # The key must unpack into exactly (row, column).
        row_position, column_name = key
        return self._table.cell(row_position, column_name)

    def __setitem__(self, key: tuple[int, str], value: object) -> None:
        row_position, column_name = key
        self._table.set_cell(row_position, column_name, value)
81
+
82
+
83
class Table:
    """In-memory table whose cells are always strings.

    Column names are kept in their original order, and every row is stored
    as a dict containing exactly those columns.
    """

    def __init__(self, columns: Sequence[str], rows: Iterable[dict[str, object]]) -> None:
        self._columns = [str(name) for name in columns]
        normalized: list[dict[str, str]] = []
        for source_row in rows:
            record: dict[str, str] = {}
            for name in self._columns:
                raw = source_row.get(name)
                # Missing keys and explicit None both become the empty string.
                record[name] = "" if raw is None else str(raw)
            normalized.append(record)
        self._rows: list[dict[str, str]] = normalized
        self.at = _AtIndexer(self)

    def _require_column(self, column: str) -> None:
        """Raise ``KeyError`` unless ``column`` is one of the table's columns."""
        if column not in self._columns:
            raise KeyError(column)

    @property
    def columns(self) -> list[str]:
        """Column names in table order, as a fresh list."""
        return [*self._columns]

    @property
    def index(self) -> range:
        """Zero-based row positions."""
        return range(len(self._rows))

    @property
    def empty(self) -> bool:
        """True when the table holds no rows."""
        return len(self._rows) == 0

    @overload
    def __getitem__(self, key: str) -> ColumnView: ...

    @overload
    def __getitem__(self, key: list[str]) -> Table: ...

    @overload
    def __getitem__(self, key: tuple[str, ...]) -> Table: ...

    def __getitem__(self, key: str | list[str] | tuple[str, ...]) -> ColumnView | Table:
        # A single name selects one column; a list/tuple selects a sub-table.
        if isinstance(key, str):
            self._require_column(key)
            return ColumnView([record.get(key, "") for record in self._rows])

        wanted = [str(name) for name in key]
        # Report the first requested column that does not exist.
        unknown = next((name for name in wanted if name not in self._columns), None)
        if unknown is not None:
            raise KeyError(unknown)
        return Table(
            wanted, ({name: record.get(name, "") for name in wanted} for record in self._rows)
        )

    def __len__(self) -> int:
        return len(self._rows)

    def copy(self, deep: bool = True) -> Table:
        """Return an independent copy; ``deep`` is accepted but ignored."""
        del deep
        return Table(self._columns, map(dict, self._rows))

    def cell(self, row: int, column: str) -> str:
        """Return the value at ``row``/``column``; unknown columns raise ``KeyError``."""
        self._require_column(column)
        return self._rows[row].get(column, "")

    def set_cell(self, row: int, column: str, value: object) -> None:
        """Store ``value`` as a string (``None`` becomes ``""``) at ``row``/``column``."""
        self._require_column(column)
        text = "" if value is None else str(value)
        self._rows[row][column] = text

    def iter_records(self, columns: Sequence[str] | None = None) -> Iterator[dict[str, str]]:
        """Yield one dict per row, limited to ``columns`` when given."""
        if columns is None:
            selected = self._columns
        else:
            selected = [str(name) for name in columns]
        for record in self._rows:
            yield {name: record.get(name, "") for name in selected}

    def to_dict(self, orient: str = "records") -> list[dict[str, str]]:
        """Return all rows as a list of dicts; only ``orient='records'`` exists."""
        if orient == "records":
            return list(self.iter_records())
        raise ValueError("Only orient='records' is supported.")

    def to_csv(
        self, buffer: io.StringIO, *, index: bool = False, lineterminator: str = "\n"
    ) -> None:
        """Write a header line and every row to ``buffer`` as CSV."""
        if index:
            raise ValueError("Table.to_csv does not support index=True.")
        writer = csv.DictWriter(buffer, fieldnames=self._columns, lineterminator=lineterminator)
        writer.writeheader()
        writer.writerows(
            {name: record.get(name, "") for name in self._columns} for record in self._rows
        )
176
+
177
+
178
def read_csv(path: Path) -> Table:
    """Load a CSV file into a string-preserving ``Table``.

    The file is opened as ``utf-8-sig`` (a leading BOM is dropped) with
    ``newline=""`` as the ``csv`` module requires. An empty or missing
    header row gives a table with no columns.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        header = reader.fieldnames
        column_list = list(header) if header else []
        # Table consumes the reader eagerly, so this must stay inside ``with``.
        return Table(column_list, reader)
184
+
185
+
186
def table_to_csv_bytes(table: TableLike) -> bytes:
    """Serialize a table-like object to UTF-8 CSV bytes.

    The table writes itself into an in-memory text buffer via ``to_csv``
    (no index column, ``"\\n"`` line terminator) and the text is encoded as
    UTF-8. The same call serves the built-in table and pandas-compatible
    objects, so no type check is needed; the previous ``isinstance`` branch
    ran byte-identical code on both sides.

    Args:
        table: Any object implementing the ``TableLike.to_csv`` signature.

    Returns:
        The CSV text encoded as UTF-8.
    """
    output = io.StringIO()
    table.to_csv(output, index=False, lineterminator="\n")
    return output.getvalue().encode("utf-8")
195
+
196
+
197
def column_names(table: TableLike) -> list[str]:
    """List the table's column labels, each converted with ``str()``."""
    return list(map(str, table.columns))
200
+
201
+
202
def row_count(table: TableLike) -> int:
    """Count rows by taking the length of the table's ``index``."""
    positions = table.index
    return len(positions)
205
+
206
+
207
def column_values(table: TableLike, column: str) -> list[Any]:
    """Return one column's values as a list.

    When the column object offers ``tolist()`` its result is used;
    otherwise the column itself is iterated.
    """
    column_data = table[column]
    source = column_data.tolist() if hasattr(column_data, "tolist") else column_data
    return list(source)
213
+
214
+
215
def cell_value(table: TableLike, row: int, column: str) -> str:
    """Read ``table.at[row, column]`` and return it converted with ``str()``."""
    raw = table.at[row, column]
    return str(raw)
218
+
219
+
220
def set_cell_value(table: TableLike, row: int, column: str, value: object) -> None:
    """Assign ``value`` through the table's ``at[row, column]`` indexer."""
    indexer = table.at
    indexer[row, column] = value
223
+
224
+
225
def copy_table(table: TableLike) -> TableLike:
    """Return ``table.copy(deep=True)``, typed as ``TableLike``."""
    # ``copy`` is declared as returning Any; the cast only informs type checkers.
    return cast(TableLike, table.copy(deep=True))
@@ -0,0 +1,34 @@
1
+ """Transaction exports for DataForge."""
2
+
3
+ from dataforge.transactions.log import (
4
+ LEGACY_SCHEMA_NAME,
5
+ SCHEMA_NAME,
6
+ TransactionAuditReport,
7
+ TransactionAuditVerdict,
8
+ append_applied_event,
9
+ append_created_transaction,
10
+ append_reverted_event,
11
+ find_transaction_log,
12
+ load_transaction,
13
+ verify_transaction_log,
14
+ )
15
+ from dataforge.transactions.revert import TransactionRevertError, revert_transaction
16
+ from dataforge.transactions.txn import CellFix, RepairTransaction, generate_txn_id
17
+
18
# Public names of the transactions package; each entry is imported above
# from the log, revert or txn submodule.
__all__ = [
    "CellFix",
    "LEGACY_SCHEMA_NAME",
    "SCHEMA_NAME",
    "TransactionAuditReport",
    "TransactionAuditVerdict",
    "RepairTransaction",
    "TransactionRevertError",
    "append_applied_event",
    "append_created_transaction",
    "append_reverted_event",
    "find_transaction_log",
    "generate_txn_id",
    "load_transaction",
    "revert_transaction",
    "verify_transaction_log",
]