dataforge-07 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge/__init__.py +204 -0
- dataforge/__main__.py +5 -0
- dataforge/agent/__init__.py +16 -0
- dataforge/agent/providers.py +259 -0
- dataforge/agent/scratchpad.py +183 -0
- dataforge/agent/tool_actions.py +343 -0
- dataforge/bench/__init__.py +31 -0
- dataforge/bench/core.py +426 -0
- dataforge/bench/groq_client.py +386 -0
- dataforge/bench/methods.py +443 -0
- dataforge/bench/report.py +309 -0
- dataforge/bench/runner.py +247 -0
- dataforge/causal/__init__.py +21 -0
- dataforge/causal/dag.py +174 -0
- dataforge/causal/pc.py +232 -0
- dataforge/causal/root_cause.py +193 -0
- dataforge/cli/__init__.py +50 -0
- dataforge/cli/audit.py +70 -0
- dataforge/cli/bench.py +154 -0
- dataforge/cli/common.py +267 -0
- dataforge/cli/constraints.py +407 -0
- dataforge/cli/profile.py +147 -0
- dataforge/cli/release.py +166 -0
- dataforge/cli/repair.py +407 -0
- dataforge/cli/revert.py +139 -0
- dataforge/cli/watch.py +144 -0
- dataforge/datasets/__init__.py +25 -0
- dataforge/datasets/embedded/hospital/clean.csv +11 -0
- dataforge/datasets/embedded/hospital/dirty.csv +11 -0
- dataforge/datasets/real_world.py +290 -0
- dataforge/datasets/registry.py +103 -0
- dataforge/detectors/__init__.py +80 -0
- dataforge/detectors/base.py +145 -0
- dataforge/detectors/decimal_shift.py +166 -0
- dataforge/detectors/fd_violation.py +157 -0
- dataforge/detectors/type_mismatch.py +173 -0
- dataforge/engine/__init__.py +39 -0
- dataforge/engine/repair.py +905 -0
- dataforge/env/__init__.py +22 -0
- dataforge/env/environment.py +883 -0
- dataforge/env/observation.py +61 -0
- dataforge/env/openenv_core.py +161 -0
- dataforge/env/reward.py +128 -0
- dataforge/env/server.py +176 -0
- dataforge/evaluation_contract.py +76 -0
- dataforge/fixtures/hospital_10rows.csv +11 -0
- dataforge/fixtures/hospital_schema.yaml +17 -0
- dataforge/http/__init__.py +1 -0
- dataforge/http/problem.py +103 -0
- dataforge/integrations/__init__.py +1 -0
- dataforge/integrations/dbt.py +164 -0
- dataforge/observability.py +76 -0
- dataforge/py.typed +1 -0
- dataforge/release/__init__.py +1 -0
- dataforge/release/doctor.py +367 -0
- dataforge/release/full_vision.py +702 -0
- dataforge/release/gate.py +861 -0
- dataforge/release/playground_check.py +411 -0
- dataforge/repair_contract.py +468 -0
- dataforge/repairers/__init__.py +88 -0
- dataforge/repairers/base.py +77 -0
- dataforge/repairers/decimal_shift.py +43 -0
- dataforge/repairers/fd_violation.py +225 -0
- dataforge/repairers/type_mismatch.py +73 -0
- dataforge/safety/__init__.py +5 -0
- dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
- dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
- dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
- dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
- dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
- dataforge/safety/constitution.py +307 -0
- dataforge/safety/constitutions/default.yaml +40 -0
- dataforge/safety/filter.py +134 -0
- dataforge/schema_inference.py +620 -0
- dataforge/stores/__init__.py +46 -0
- dataforge/stores/base.py +73 -0
- dataforge/stores/cloud.py +78 -0
- dataforge/stores/csv.py +94 -0
- dataforge/stores/duckdb.py +313 -0
- dataforge/stores/patch_plan.py +178 -0
- dataforge/stores/registry.py +82 -0
- dataforge/stores/repair.py +121 -0
- dataforge/stores/revert.py +22 -0
- dataforge/stores/sql.py +27 -0
- dataforge/table.py +228 -0
- dataforge/transactions/__init__.py +34 -0
- dataforge/transactions/files.py +96 -0
- dataforge/transactions/log.py +613 -0
- dataforge/transactions/revert.py +102 -0
- dataforge/transactions/txn.py +104 -0
- dataforge/ui/__init__.py +1 -0
- dataforge/ui/profile_view.py +136 -0
- dataforge/ui/repair_diff.py +91 -0
- dataforge/verifier/__init__.py +55 -0
- dataforge/verifier/constraint_ir.py +155 -0
- dataforge/verifier/explain.py +47 -0
- dataforge/verifier/gate.py +5 -0
- dataforge/verifier/schema.py +111 -0
- dataforge/verifier/smt.py +433 -0
- dataforge_07-0.1.0.dist-info/METADATA +436 -0
- dataforge_07-0.1.0.dist-info/RECORD +150 -0
- dataforge_07-0.1.0.dist-info/WHEEL +5 -0
- dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
- dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
- dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,613 @@
|
|
|
1
|
+
"""Append-only JSONL transaction journal for DataForge repairs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import enum
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
from datetime import UTC, datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
from dataforge.transactions.txn import RepairTransaction
|
|
16
|
+
|
|
17
|
+
# Legacy (v1) journal format: events carry no hashes.
LEGACY_SCHEMA_VERSION = 1
LEGACY_SCHEMA_NAME = "transaction_journal_v1"

# Current (v2) journal format: events are hash-chained.
SCHEMA_VERSION = 2
SCHEMA_NAME = "transaction_journal_v2"

# A lowercase hexadecimal SHA-256 digest is exactly 64 characters long.
_SHA256_RE = re.compile(r"^[0-9a-f]{64}$")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TransactionLogError(Exception):
    """Signal that a transaction journal could not be written or replayed.

    The journal helpers in this module raise it for logs that are missing,
    malformed, or cannot be written to disk.
    """
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TransactionAuditVerdict(enum.Enum):
    """Closed set of outcomes produced by transaction log audit verification."""

    # v2 hash chain checked out and, if applied, the transaction is revertible.
    VERIFIED = "verified"
    # v1 log replayed fine but carries no event hashes to verify.
    LEGACY_UNVERIFIED = "legacy_unverified"
    # Chain is intact but the source file or snapshot needed to revert is unusable.
    UNREVERTIBLE = "unrevertible"
    # Hash chain, event indexes, or txn_id values are inconsistent.
    TAMPERED = "tampered"
    # No log could be located for the request.
    MISSING = "missing"
    # Log is unreadable, empty, or mixes/uses unsupported schema versions.
    MALFORMED = "malformed"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class TransactionAuditReport(BaseModel):
    """Immutable, machine-readable outcome of auditing one transaction log.

    Only ``verdict`` and ``event_count`` are required; every other field
    defaults to ``None`` (or an empty tuple for ``errors``).
    """

    # Frozen: a report cannot be modified after it has been constructed.
    model_config = {"frozen": True}

    # Overall audit outcome.
    verdict: TransactionAuditVerdict
    # String form of the audited log path, when one was resolved.
    log_path: str | None = None
    # Transaction identifier the report refers to, when known.
    txn_id: str | None = None
    # Journal schema version/name, when the log's format was determined.
    schema_version: int | None = None
    schema_name: str | None = None
    # Number of events read from the log; never negative.
    event_count: int = Field(ge=0)
    # Hash of the last event carrying a well-formed SHA-256 digest, if any.
    head_sha256: str | None = Field(default=None, pattern=r"^[0-9a-f]{64}$")
    # Human-readable findings that explain a non-verified verdict.
    errors: tuple[str, ...] = Field(default_factory=tuple)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def sha256_bytes(payload: bytes) -> str:
    """Hash ``payload`` with SHA-256 and return the lowercase hex digest."""
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is hashed in fixed-size chunks so that large data files are never
    loaded into memory in one piece.

    Args:
        path: File to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest of the file's bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        # 1 MiB reads keep memory flat regardless of file size.
        while chunk := handle.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def dataforge_root_for(source_path: Path) -> Path:
    """Locate the hidden ``.dataforge`` state directory beside ``source_path``.

    The source path is resolved first, and the state directory is a sibling
    entry in the same parent directory. Nothing is created on disk.
    """
    containing_dir = source_path.resolve().parent
    return containing_dir.joinpath(".dataforge")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def transactions_dir_for(source_path: Path) -> Path:
    """Return the directory that holds transaction journals for ``source_path``.

    This is the ``transactions`` child of the DataForge state directory; the
    directory itself is not created here.
    """
    state_root = dataforge_root_for(source_path)
    return state_root.joinpath("transactions")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def snapshots_dir_for(source_path: Path) -> Path:
    """Return the directory that holds source snapshots for ``source_path``.

    This is the ``snapshots`` child of the DataForge state directory; the
    directory itself is not created here.
    """
    state_root = dataforge_root_for(source_path)
    return state_root.joinpath("snapshots")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def cache_dir_for(source_path: Path) -> Path:
    """Return the cache directory associated with ``source_path``.

    This is the ``cache`` child of the DataForge state directory; the
    directory itself is not created here.
    """
    state_root = dataforge_root_for(source_path)
    return state_root.joinpath("cache")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def snapshot_path_for(source_path: Path, txn_id: str) -> Path:
    """Return the snapshot file path for transaction ``txn_id``.

    The snapshot lives in the snapshots directory of ``source_path`` and is
    named ``<txn_id>.bin``.
    """
    file_name = f"{txn_id}.bin"
    return snapshots_dir_for(source_path).joinpath(file_name)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def transaction_log_path_for(source_path: Path, txn_id: str) -> Path:
    """Return the JSONL journal path for transaction ``txn_id``.

    The journal lives in the transactions directory of ``source_path`` and is
    named ``<txn_id>.jsonl``.
    """
    file_name = f"{txn_id}.jsonl"
    return transactions_dir_for(source_path).joinpath(file_name)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _utc_now() -> datetime:
|
|
95
|
+
"""Return the current UTC timestamp."""
|
|
96
|
+
return datetime.now(UTC)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _canonical_event_bytes(record: dict[str, Any]) -> bytes:
    """Produce the canonical byte string that an event's hash is computed over.

    The ``event_sha256`` field is excluded, keys are sorted, separators are
    compact, non-ASCII characters are kept as-is, and the text is encoded as
    UTF-8. The input mapping is not modified.
    """
    hashable = dict(record)
    # The hash cannot cover itself, so drop it from the hashed material.
    hashable.pop("event_sha256", None)
    canonical_text = json.dumps(
        hashable,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
    return canonical_text.encode("utf-8")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _event_sha256(record: dict[str, Any]) -> str:
    """Compute the SHA-256 hex digest of an event's canonical serialization."""
    canonical = _canonical_event_bytes(record)
    return sha256_bytes(canonical)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _sign_event(record: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict equal to ``record`` plus its ``event_sha256`` field.

    The input is left untouched. Any ``event_sha256`` already present is
    ignored by the hash computation and overwritten in the returned copy.
    """
    return {**record, "event_sha256": _event_sha256(record)}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _write_jsonl_line(path: Path, record: dict[str, Any], *, create: bool = False) -> None:
    """Append or create a JSONL record on disk.

    The record is serialized with sorted keys and written as a single line
    terminated by ``\\n``. Missing parent directories are created.

    Args:
        path: The target JSONL log path.
        record: JSON-serializable record to write.
        create: When true, fail if the file already exists.

    Raises:
        TransactionLogError: If the record cannot be serialized, the parent
            directory cannot be created, or the file cannot be written.
    """
    # Serialize before touching the filesystem so that an unserializable record
    # can never leave an empty, exclusively-created log file behind.
    try:
        line = json.dumps(record, sort_keys=True)
    except (TypeError, ValueError) as exc:
        raise TransactionLogError(
            f"Could not serialize transaction event for '{path}': {exc}"
        ) from exc

    mode = "x" if create else "a"
    try:
        # mkdir is inside the try so its failures are reported as log errors too.
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open(mode, encoding="utf-8", newline="\n") as handle:
            handle.write(line + "\n")
    except OSError as exc:
        raise TransactionLogError(f"Could not write transaction log '{path}': {exc}") from exc
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _read_records(log_path: Path) -> list[dict[str, Any]]:
    """Read non-empty JSONL records from a transaction log.

    Blank and whitespace-only lines are skipped. Every remaining line must be
    a JSON object.

    Args:
        log_path: Path to the JSONL log file.

    Returns:
        The decoded event objects in file order.

    Raises:
        TransactionLogError: If the file cannot be read or decoded as UTF-8,
            a line is not valid JSON, or a line is not a JSON object.
    """
    try:
        text = log_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # Surface I/O and encoding failures through the module's own error type
        # so callers that audit logs can report them instead of crashing.
        raise TransactionLogError(
            f"Could not read transaction log '{log_path}': {exc}"
        ) from exc

    records: list[dict[str, Any]] = []
    for line_number, raw_line in enumerate(text.splitlines(), 1):
        if not raw_line.strip():
            continue
        try:
            payload = json.loads(raw_line)
        except json.JSONDecodeError as exc:
            raise TransactionLogError(
                f"Malformed JSON at {log_path}:{line_number}: {exc.msg}"
            ) from exc
        if not isinstance(payload, dict):
            raise TransactionLogError(f"Malformed transaction event at {log_path}:{line_number}.")
        records.append(payload)
    return records
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _log_schema_version(log_path: Path) -> int | None:
    """Report the ``schema_version`` of the first event in an existing log.

    Returns ``None`` when the log does not exist, holds no events, or its
    first event has no integer ``schema_version``.
    """
    if log_path.exists():
        events = _read_records(log_path)
        if events:
            candidate = events[0].get("schema_version")
            if isinstance(candidate, int):
                return candidate
    return None
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _next_event_metadata(log_path: Path) -> tuple[int, str | None]:
    """Work out the index and predecessor hash for the next v2 event.

    Returns:
        A ``(event_index, previous_event_sha256)`` pair, where the index is
        the number of events already in the log and the hash is the
        ``event_sha256`` of the last one.

    Raises:
        TransactionLogError: If the log holds no events or its last event has
            no well-formed SHA-256 ``event_sha256``.
    """
    events = _read_records(log_path)
    if len(events) == 0:
        raise TransactionLogError(f"Transaction log '{log_path}' contained no events.")

    tail_hash = events[-1].get("event_sha256")
    tail_hash_ok = isinstance(tail_hash, str) and _SHA256_RE.fullmatch(tail_hash) is not None
    if not tail_hash_ok:
        raise TransactionLogError(
            f"Transaction log '{log_path}' is missing a valid previous event hash."
        )
    return len(events), tail_hash
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _v1_created_record(transaction: RepairTransaction) -> dict[str, Any]:
    """Assemble the legacy (v1, un-hashed) ``created`` event for ``transaction``.

    The event timestamp is the transaction's ``created_at`` and the full
    transaction is embedded in JSON-compatible form.
    """
    occurred_at = transaction.created_at.isoformat()
    snapshot = transaction.model_dump(mode="json")
    return dict(
        schema_version=LEGACY_SCHEMA_VERSION,
        event_type="created",
        occurred_at=occurred_at,
        transaction=snapshot,
    )
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _v2_created_record(transaction: RepairTransaction) -> dict[str, Any]:
    """Assemble the signed v2 ``created`` event that starts a hash chain.

    As the first event it has index 0 and no predecessor hash. The returned
    record includes its ``event_sha256``.
    """
    occurred_at = transaction.created_at.isoformat()
    snapshot = transaction.model_dump(mode="json")
    unsigned: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "schema_name": SCHEMA_NAME,
        "event_index": 0,
        "event_type": "created",
        "occurred_at": occurred_at,
        "previous_event_sha256": None,
        "transaction": snapshot,
    }
    return _sign_event(unsigned)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _v1_applied_record(txn_id: str, post_sha256: str, applied_at: datetime) -> dict[str, Any]:
    """Assemble the legacy (v1, un-hashed) ``applied`` event.

    Records the transaction id, the post-apply content hash, and the ISO
    timestamp of ``applied_at``.
    """
    return dict(
        schema_version=LEGACY_SCHEMA_VERSION,
        event_type="applied",
        occurred_at=applied_at.isoformat(),
        txn_id=txn_id,
        post_sha256=post_sha256,
    )
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _v2_applied_record(
    log_path: Path,
    txn_id: str,
    post_sha256: str,
    applied_at: datetime,
) -> dict[str, Any]:
    """Assemble the signed v2 ``applied`` event that extends the log's chain.

    The event index and predecessor hash are read from the current contents
    of ``log_path``; the returned record includes its ``event_sha256``.
    """
    next_index, tail_hash = _next_event_metadata(log_path)
    unsigned: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "schema_name": SCHEMA_NAME,
        "event_index": next_index,
        "event_type": "applied",
        "occurred_at": applied_at.isoformat(),
        "previous_event_sha256": tail_hash,
        "txn_id": txn_id,
        "post_sha256": post_sha256,
    }
    return _sign_event(unsigned)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _v1_reverted_record(txn_id: str, reverted_at: datetime) -> dict[str, Any]:
    """Assemble the legacy (v1, un-hashed) ``reverted`` event.

    Records the transaction id and the ISO timestamp of ``reverted_at``.
    """
    return dict(
        schema_version=LEGACY_SCHEMA_VERSION,
        event_type="reverted",
        occurred_at=reverted_at.isoformat(),
        txn_id=txn_id,
    )
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _v2_reverted_record(log_path: Path, txn_id: str, reverted_at: datetime) -> dict[str, Any]:
    """Assemble the signed v2 ``reverted`` event that extends the log's chain.

    The event index and predecessor hash are read from the current contents
    of ``log_path``; the returned record includes its ``event_sha256``.
    """
    next_index, tail_hash = _next_event_metadata(log_path)
    unsigned: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "schema_name": SCHEMA_NAME,
        "event_index": next_index,
        "event_type": "reverted",
        "occurred_at": reverted_at.isoformat(),
        "previous_event_sha256": tail_hash,
        "txn_id": txn_id,
    }
    return _sign_event(unsigned)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def append_created_transaction(
    transaction: RepairTransaction,
    *,
    log_root: Path | None = None,
) -> Path:
    """Start a new journal by writing the transaction's v2 ``created`` event.

    The log file is opened in exclusive-create mode, so an existing log for
    the same transaction id is never overwritten.

    Args:
        transaction: The transaction to serialize.
        log_root: Optional directory under which the log is placed, at
            ``<log_root>/.dataforge/transactions/<txn_id>.jsonl``. When
            omitted, the log is placed beside the transaction's source path.

    Returns:
        The created JSONL log path.

    Raises:
        TransactionLogError: If the log cannot be written, for example
            because it already exists.
    """
    source_path = Path(transaction.source_path)
    if log_root is None:
        log_path = transaction_log_path_for(source_path, transaction.txn_id)
    else:
        journal_dir = log_root.resolve() / ".dataforge" / "transactions"
        log_path = journal_dir / f"{transaction.txn_id}.jsonl"

    created_event = _v2_created_record(transaction)
    _write_jsonl_line(log_path, created_event, create=True)
    return log_path
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def append_applied_event(
    log_path: Path,
    txn_id: str,
    post_sha256: str,
    *,
    applied_at: datetime | None = None,
) -> None:
    """Append an ``applied`` event to an existing transaction log.

    The event is written in the legacy v1 shape when the log's first event is
    v1, and as a hash-chained v2 event otherwise.

    Args:
        log_path: Path of the journal to extend.
        txn_id: Identifier of the transaction that was applied.
        post_sha256: Content hash recorded as the post-apply state.
        applied_at: Event timestamp; the current UTC time when omitted.
    """
    occurred_at = applied_at if applied_at else _utc_now()
    if _log_schema_version(log_path) == LEGACY_SCHEMA_VERSION:
        event = _v1_applied_record(txn_id, post_sha256, occurred_at)
    else:
        event = _v2_applied_record(log_path, txn_id, post_sha256, occurred_at)
    _write_jsonl_line(log_path, event, create=False)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def append_reverted_event(
    log_path: Path,
    txn_id: str,
    *,
    reverted_at: datetime | None = None,
) -> None:
    """Append a ``reverted`` event to an existing transaction log.

    The event is written in the legacy v1 shape when the log's first event is
    v1, and as a hash-chained v2 event otherwise.

    Args:
        log_path: Path of the journal to extend.
        txn_id: Identifier of the transaction that was reverted.
        reverted_at: Event timestamp; the current UTC time when omitted.
    """
    occurred_at = reverted_at if reverted_at else _utc_now()
    if _log_schema_version(log_path) == LEGACY_SCHEMA_VERSION:
        event = _v1_reverted_record(txn_id, occurred_at)
    else:
        event = _v2_reverted_record(log_path, txn_id, occurred_at)
    _write_jsonl_line(log_path, event, create=False)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def load_transaction(log_path: Path) -> RepairTransaction:
    """Replay a transaction log into the latest transaction state.

    Events are processed in file order: ``created`` establishes the
    transaction, ``applied`` marks it applied and records the post-state
    hash, and ``reverted`` records the revert timestamp.

    Args:
        log_path: Path to the JSONL log file.

    Returns:
        The latest replayed transaction state.

    Raises:
        TransactionLogError: If the log is missing or malformed, including
            events that lack required fields or carry invalid values.
    """
    if not log_path.exists():
        raise TransactionLogError(f"Transaction log not found: {log_path}")

    transaction: RepairTransaction | None = None
    for payload in _read_records(log_path):
        # Tuple membership (not a set) so an unhashable JSON value such as a
        # list is rejected as unsupported instead of raising TypeError.
        if payload.get("schema_version") not in (LEGACY_SCHEMA_VERSION, SCHEMA_VERSION):
            raise TransactionLogError(
                f"Unsupported transaction log schema version in '{log_path}'."
            )

        event_type = payload.get("event_type")
        if event_type == "created":
            try:
                transaction = RepairTransaction.model_validate(payload["transaction"])
            except (KeyError, TypeError, ValueError) as exc:
                # Pydantic's ValidationError is a ValueError, so a bad payload
                # lands here as well as a missing "transaction" key.
                raise TransactionLogError(
                    f"Transaction log '{log_path}' has an invalid created event: {exc}"
                ) from exc
            continue

        if transaction is None:
            raise TransactionLogError(
                f"Transaction log '{log_path}' is missing the initial created event."
            )

        if payload.get("txn_id") != transaction.txn_id:
            raise TransactionLogError(
                f"Transaction log '{log_path}' contains mismatched txn_id values."
            )

        if event_type == "applied":
            if "post_sha256" not in payload:
                raise TransactionLogError(
                    f"Transaction log '{log_path}' has an applied event without post_sha256."
                )
            transaction = transaction.model_copy(
                update={
                    "applied": True,
                    "post_sha256": payload["post_sha256"],
                }
            )
        elif event_type == "reverted":
            try:
                reverted_at = datetime.fromisoformat(payload["occurred_at"])
            except (KeyError, TypeError, ValueError) as exc:
                raise TransactionLogError(
                    f"Transaction log '{log_path}' has a reverted event "
                    f"with a missing or invalid occurred_at."
                ) from exc
            transaction = transaction.model_copy(update={"reverted_at": reverted_at})
        else:
            raise TransactionLogError(
                f"Unknown transaction log event type '{event_type}' in '{log_path}'."
            )

    if transaction is None:
        raise TransactionLogError(f"Transaction log '{log_path}' contained no transaction data.")

    return transaction
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def find_transaction_log(txn_id: str, *, search_root: Path | None = None) -> Path:
|
|
387
|
+
"""Locate a transaction log by identifier under the working tree.
|
|
388
|
+
|
|
389
|
+
Args:
|
|
390
|
+
txn_id: Canonical transaction identifier.
|
|
391
|
+
search_root: Optional root directory to search under.
|
|
392
|
+
|
|
393
|
+
Returns:
|
|
394
|
+
The unique matching JSONL log path.
|
|
395
|
+
|
|
396
|
+
Raises:
|
|
397
|
+
TransactionLogError: If no log or multiple logs are found.
|
|
398
|
+
"""
|
|
399
|
+
root = (search_root or Path.cwd()).resolve()
|
|
400
|
+
direct_candidate = root / ".dataforge" / "transactions" / f"{txn_id}.jsonl"
|
|
401
|
+
if direct_candidate.exists():
|
|
402
|
+
return direct_candidate
|
|
403
|
+
|
|
404
|
+
matches: list[Path] = []
|
|
405
|
+
for candidate in root.rglob(f"{txn_id}.jsonl"):
|
|
406
|
+
if candidate.parent.name == "transactions" and candidate.parent.parent.name == ".dataforge":
|
|
407
|
+
matches.append(candidate)
|
|
408
|
+
|
|
409
|
+
if not matches:
|
|
410
|
+
raise TransactionLogError(f"Could not find transaction '{txn_id}' under '{root}'.")
|
|
411
|
+
if len(matches) > 1:
|
|
412
|
+
raise TransactionLogError(f"Found multiple transaction logs for '{txn_id}' under '{root}'.")
|
|
413
|
+
return matches[0]
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _replay_error_message(exc: Exception) -> str:
    """Render a log replay failure as one human-readable audit error string."""
    if isinstance(exc, TransactionLogError):
        return str(exc)
    # Bare KeyError/TypeError/ValueError text is terse, so add context.
    return f"Transaction log could not be replayed ({type(exc).__name__}: {exc})."


def verify_transaction_log(
    txn_id: str | None = None,
    *,
    log_path: Path | None = None,
    search_root: Path | None = None,
) -> TransactionAuditReport:
    """Verify a transaction log's local hash chain.

    Legacy v1 logs remain replayable but cannot be cryptographically verified,
    so they return ``legacy_unverified`` instead of ``verified``.

    Problems found in the log are reported through the returned verdict
    rather than raised: unreadable, empty, or mixed-version logs are
    ``malformed``; hash-chain, index, or txn_id inconsistencies are
    ``tampered``; an applied, un-reverted transaction whose source file or
    snapshot is missing, unreadable, or changed is ``unrevertible``.

    Args:
        txn_id: Transaction to audit. Used to locate the log when ``log_path``
            is not given, and checked against the id recorded in the log.
        log_path: Explicit log file to audit; takes precedence over lookup.
        search_root: Root directory for the lookup by ``txn_id``.

    Returns:
        The audit report for the log.
    """
    try:
        resolved_log_path = log_path.resolve() if log_path is not None else None
        if resolved_log_path is None:
            if txn_id is None:
                return TransactionAuditReport(
                    verdict=TransactionAuditVerdict.MISSING,
                    txn_id=txn_id,
                    event_count=0,
                    errors=("txn_id or log_path is required.",),
                )
            resolved_log_path = find_transaction_log(txn_id, search_root=search_root)
    except TransactionLogError as exc:
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MISSING,
            txn_id=txn_id,
            event_count=0,
            errors=(str(exc),),
        )

    if not resolved_log_path.exists():
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MISSING,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=0,
            errors=(f"Transaction log not found: {resolved_log_path}",),
        )

    try:
        records = _read_records(resolved_log_path)
    except (TransactionLogError, OSError, UnicodeDecodeError) as exc:
        # I/O and decoding failures are findings about the log, not crashes.
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MALFORMED,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=0,
            errors=(str(exc),),
        )

    if not records:
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MALFORMED,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=0,
            errors=("Transaction log contained no events.",),
        )

    # Kept as a list and compared with ``==`` so that unhashable JSON values
    # (lists, objects) in ``schema_version`` cannot raise TypeError.
    versions = [record.get("schema_version") for record in records]
    if all(version == LEGACY_SCHEMA_VERSION for version in versions):
        try:
            transaction = load_transaction(resolved_log_path)
        except (TransactionLogError, KeyError, TypeError, ValueError) as exc:
            return TransactionAuditReport(
                verdict=TransactionAuditVerdict.MALFORMED,
                log_path=str(resolved_log_path),
                schema_version=LEGACY_SCHEMA_VERSION,
                schema_name=LEGACY_SCHEMA_NAME,
                event_count=len(records),
                errors=(_replay_error_message(exc),),
            )
        if txn_id is not None and transaction.txn_id != txn_id:
            return TransactionAuditReport(
                verdict=TransactionAuditVerdict.TAMPERED,
                log_path=str(resolved_log_path),
                txn_id=transaction.txn_id,
                schema_version=LEGACY_SCHEMA_VERSION,
                schema_name=LEGACY_SCHEMA_NAME,
                event_count=len(records),
                errors=(f"Expected txn_id '{txn_id}', found '{transaction.txn_id}'.",),
            )
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.LEGACY_UNVERIFIED,
            log_path=str(resolved_log_path),
            txn_id=transaction.txn_id,
            schema_version=LEGACY_SCHEMA_VERSION,
            schema_name=LEGACY_SCHEMA_NAME,
            event_count=len(records),
            errors=("Legacy v1 logs do not contain event hashes.",),
        )

    if any(version != SCHEMA_VERSION for version in versions):
        seen_versions = sorted({str(version) for version in versions})
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MALFORMED,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=len(records),
            errors=(f"Mixed or unsupported schema versions: {seen_versions}.",),
        )

    errors: list[str] = []
    previous_hash: str | None = None
    resolved_txn_id: str | None = None
    head_sha256: str | None = None
    for expected_index, record in enumerate(records):
        if record.get("event_index") != expected_index:
            errors.append(f"Event {expected_index} has event_index {record.get('event_index')!r}.")
        if record.get("previous_event_sha256") != previous_hash:
            errors.append(f"Event {expected_index} previous hash does not match.")

        recorded_hash = record.get("event_sha256")
        if not isinstance(recorded_hash, str) or not _SHA256_RE.fullmatch(recorded_hash):
            errors.append(f"Event {expected_index} is missing a valid event hash.")
        else:
            try:
                calculated_hash = _event_sha256(record)
            except (TypeError, ValueError):
                # e.g. a lone surrogate in the payload cannot be UTF-8 encoded.
                calculated_hash = None
                errors.append(f"Event {expected_index} payload could not be hashed.")
            if calculated_hash is not None and calculated_hash != recorded_hash:
                errors.append(f"Event {expected_index} hash does not match its payload.")
            # The chain advances on the recorded hash, matching how it was written.
            previous_hash = recorded_hash
            head_sha256 = recorded_hash

        event_type = record.get("event_type")
        if event_type == "created":
            raw_transaction = record.get("transaction")
            if not isinstance(raw_transaction, dict):
                errors.append("Created event is missing a transaction payload.")
            else:
                current_txn_id = raw_transaction.get("txn_id")
                if not isinstance(current_txn_id, str):
                    errors.append("Created transaction payload is missing txn_id.")
                elif resolved_txn_id is None:
                    resolved_txn_id = current_txn_id
                elif resolved_txn_id != current_txn_id:
                    errors.append("Created transaction payload changed txn_id.")
        elif event_type in ("applied", "reverted"):
            # Tuple membership tolerates unhashable event_type values.
            current_txn_id = record.get("txn_id")
            if current_txn_id != resolved_txn_id:
                errors.append(
                    f"Event {expected_index} txn_id {current_txn_id!r} does not match created event."
                )
        else:
            errors.append(f"Event {expected_index} has unknown event_type {event_type!r}.")

    if txn_id is not None and resolved_txn_id is not None and resolved_txn_id != txn_id:
        errors.append(f"Expected txn_id '{txn_id}', found '{resolved_txn_id}'.")

    transaction = None
    try:
        transaction = load_transaction(resolved_log_path)
    except (TransactionLogError, KeyError, TypeError, ValueError) as exc:
        # Any replay failure is evidence about the log and becomes a finding.
        errors.append(_replay_error_message(exc))

    if errors or transaction is None:
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.TAMPERED,
            log_path=str(resolved_log_path),
            txn_id=resolved_txn_id or txn_id,
            schema_version=SCHEMA_VERSION,
            schema_name=SCHEMA_NAME,
            event_count=len(records),
            head_sha256=head_sha256,
            errors=tuple(errors),
        )

    # An applied, not-yet-reverted transaction must still be revertible.
    if transaction.applied and transaction.reverted_at is None:
        source_path = Path(transaction.source_path)
        snapshot_path = Path(transaction.source_snapshot_path)
        revert_errors: list[str] = []
        if transaction.source_kind == "file":
            if not source_path.exists():
                revert_errors.append(f"Source file not found: {source_path}")
            elif transaction.post_sha256 is not None:
                try:
                    current_sha256 = sha256_file(source_path)
                except OSError as exc:
                    revert_errors.append(f"Source file could not be read: {exc}")
                else:
                    if current_sha256 != transaction.post_sha256:
                        revert_errors.append(
                            "Source file no longer matches the recorded post-state hash."
                        )
        if not snapshot_path.exists():
            revert_errors.append(f"Source snapshot not found: {snapshot_path}")
        if revert_errors:
            return TransactionAuditReport(
                verdict=TransactionAuditVerdict.UNREVERTIBLE,
                log_path=str(resolved_log_path),
                txn_id=resolved_txn_id,
                schema_version=SCHEMA_VERSION,
                schema_name=SCHEMA_NAME,
                event_count=len(records),
                head_sha256=head_sha256,
                errors=tuple(revert_errors),
            )

    return TransactionAuditReport(
        verdict=TransactionAuditVerdict.VERIFIED,
        log_path=str(resolved_log_path),
        txn_id=resolved_txn_id,
        schema_version=SCHEMA_VERSION,
        schema_name=SCHEMA_NAME,
        event_count=len(records),
        head_sha256=head_sha256,
    )
|