dataforge-07 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. dataforge/__init__.py +204 -0
  2. dataforge/__main__.py +5 -0
  3. dataforge/agent/__init__.py +16 -0
  4. dataforge/agent/providers.py +259 -0
  5. dataforge/agent/scratchpad.py +183 -0
  6. dataforge/agent/tool_actions.py +343 -0
  7. dataforge/bench/__init__.py +31 -0
  8. dataforge/bench/core.py +426 -0
  9. dataforge/bench/groq_client.py +386 -0
  10. dataforge/bench/methods.py +443 -0
  11. dataforge/bench/report.py +309 -0
  12. dataforge/bench/runner.py +247 -0
  13. dataforge/causal/__init__.py +21 -0
  14. dataforge/causal/dag.py +174 -0
  15. dataforge/causal/pc.py +232 -0
  16. dataforge/causal/root_cause.py +193 -0
  17. dataforge/cli/__init__.py +50 -0
  18. dataforge/cli/audit.py +70 -0
  19. dataforge/cli/bench.py +154 -0
  20. dataforge/cli/common.py +267 -0
  21. dataforge/cli/constraints.py +407 -0
  22. dataforge/cli/profile.py +147 -0
  23. dataforge/cli/release.py +166 -0
  24. dataforge/cli/repair.py +407 -0
  25. dataforge/cli/revert.py +139 -0
  26. dataforge/cli/watch.py +144 -0
  27. dataforge/datasets/__init__.py +25 -0
  28. dataforge/datasets/embedded/hospital/clean.csv +11 -0
  29. dataforge/datasets/embedded/hospital/dirty.csv +11 -0
  30. dataforge/datasets/real_world.py +290 -0
  31. dataforge/datasets/registry.py +103 -0
  32. dataforge/detectors/__init__.py +80 -0
  33. dataforge/detectors/base.py +145 -0
  34. dataforge/detectors/decimal_shift.py +166 -0
  35. dataforge/detectors/fd_violation.py +157 -0
  36. dataforge/detectors/type_mismatch.py +173 -0
  37. dataforge/engine/__init__.py +39 -0
  38. dataforge/engine/repair.py +905 -0
  39. dataforge/env/__init__.py +22 -0
  40. dataforge/env/environment.py +883 -0
  41. dataforge/env/observation.py +61 -0
  42. dataforge/env/openenv_core.py +161 -0
  43. dataforge/env/reward.py +128 -0
  44. dataforge/env/server.py +176 -0
  45. dataforge/evaluation_contract.py +76 -0
  46. dataforge/fixtures/hospital_10rows.csv +11 -0
  47. dataforge/fixtures/hospital_schema.yaml +17 -0
  48. dataforge/http/__init__.py +1 -0
  49. dataforge/http/problem.py +103 -0
  50. dataforge/integrations/__init__.py +1 -0
  51. dataforge/integrations/dbt.py +164 -0
  52. dataforge/observability.py +76 -0
  53. dataforge/py.typed +1 -0
  54. dataforge/release/__init__.py +1 -0
  55. dataforge/release/doctor.py +367 -0
  56. dataforge/release/full_vision.py +702 -0
  57. dataforge/release/gate.py +861 -0
  58. dataforge/release/playground_check.py +411 -0
  59. dataforge/repair_contract.py +468 -0
  60. dataforge/repairers/__init__.py +88 -0
  61. dataforge/repairers/base.py +77 -0
  62. dataforge/repairers/decimal_shift.py +43 -0
  63. dataforge/repairers/fd_violation.py +225 -0
  64. dataforge/repairers/type_mismatch.py +73 -0
  65. dataforge/safety/__init__.py +5 -0
  66. dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
  67. dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
  68. dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
  69. dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
  70. dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
  71. dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
  72. dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
  73. dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
  74. dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
  75. dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
  76. dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
  77. dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
  78. dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
  79. dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
  80. dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
  81. dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
  82. dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
  83. dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
  84. dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
  85. dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
  86. dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
  87. dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
  88. dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
  89. dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
  90. dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
  91. dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
  92. dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
  93. dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
  94. dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
  95. dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
  96. dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
  97. dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
  98. dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
  99. dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
  100. dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
  101. dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
  102. dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
  103. dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
  104. dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
  105. dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
  106. dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
  107. dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
  108. dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
  109. dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
  110. dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
  111. dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
  112. dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
  113. dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
  114. dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
  115. dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
  116. dataforge/safety/constitution.py +307 -0
  117. dataforge/safety/constitutions/default.yaml +40 -0
  118. dataforge/safety/filter.py +134 -0
  119. dataforge/schema_inference.py +620 -0
  120. dataforge/stores/__init__.py +46 -0
  121. dataforge/stores/base.py +73 -0
  122. dataforge/stores/cloud.py +78 -0
  123. dataforge/stores/csv.py +94 -0
  124. dataforge/stores/duckdb.py +313 -0
  125. dataforge/stores/patch_plan.py +178 -0
  126. dataforge/stores/registry.py +82 -0
  127. dataforge/stores/repair.py +121 -0
  128. dataforge/stores/revert.py +22 -0
  129. dataforge/stores/sql.py +27 -0
  130. dataforge/table.py +228 -0
  131. dataforge/transactions/__init__.py +34 -0
  132. dataforge/transactions/files.py +96 -0
  133. dataforge/transactions/log.py +613 -0
  134. dataforge/transactions/revert.py +102 -0
  135. dataforge/transactions/txn.py +104 -0
  136. dataforge/ui/__init__.py +1 -0
  137. dataforge/ui/profile_view.py +136 -0
  138. dataforge/ui/repair_diff.py +91 -0
  139. dataforge/verifier/__init__.py +55 -0
  140. dataforge/verifier/constraint_ir.py +155 -0
  141. dataforge/verifier/explain.py +47 -0
  142. dataforge/verifier/gate.py +5 -0
  143. dataforge/verifier/schema.py +111 -0
  144. dataforge/verifier/smt.py +433 -0
  145. dataforge_07-0.1.0.dist-info/METADATA +436 -0
  146. dataforge_07-0.1.0.dist-info/RECORD +150 -0
  147. dataforge_07-0.1.0.dist-info/WHEEL +5 -0
  148. dataforge_07-0.1.0.dist-info/entry_points.txt +3 -0
  149. dataforge_07-0.1.0.dist-info/licenses/LICENSE +176 -0
  150. dataforge_07-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,613 @@
1
+ """Append-only JSONL transaction journal for DataForge repairs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import enum
6
+ import hashlib
7
+ import json
8
+ import re
9
+ from datetime import UTC, datetime
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+ from dataforge.transactions.txn import RepairTransaction
16
+
17
# Schema version of legacy journals, whose events carry no event hashes.
LEGACY_SCHEMA_VERSION = 1
# Schema version of the hash-chained journal format written by this module.
SCHEMA_VERSION = 2
# Schema names reported alongside the corresponding schema version.
LEGACY_SCHEMA_NAME = "transaction_journal_v1"
SCHEMA_NAME = "transaction_journal_v2"
# Matches a lowercase hexadecimal SHA-256 digest (exactly 64 characters).
_SHA256_RE = re.compile(r"^[0-9a-f]{64}$")
22
+
23
+
24
class TransactionLogError(Exception):
    """Signal that a transaction journal could not be written or replayed."""
26
+
27
+
28
class TransactionAuditVerdict(enum.Enum):
    """Closed set of verdicts a transaction log audit can produce."""

    # The hash chain checked out.
    VERIFIED = "verified"
    # Legacy v1 log: replayable, but it carries no event hashes to verify.
    LEGACY_UNVERIFIED = "legacy_unverified"
    # The log is intact but the applied change can no longer be reverted.
    UNREVERTIBLE = "unrevertible"
    # The hash chain or the recorded identifiers are inconsistent.
    TAMPERED = "tampered"
    # No log could be located.
    MISSING = "missing"
    # The log could not be parsed or mixes schema versions.
    MALFORMED = "malformed"
37
+
38
+
39
class TransactionAuditReport(BaseModel):
    """Machine-readable result of transaction hash-chain verification.

    Only ``verdict`` and ``event_count`` are required; every other field has
    a default so that reports can be produced even when no log was located.
    """

    # Overall outcome of the audit.
    verdict: TransactionAuditVerdict
    # String form of the journal path that was inspected, if one was resolved.
    log_path: str | None = None
    # Transaction identifier the report refers to, if known.
    txn_id: str | None = None
    # Journal schema version and name, if they could be determined.
    schema_version: int | None = None
    schema_name: str | None = None
    # Number of events read from the journal; must be non-negative.
    event_count: int = Field(ge=0)
    # Hash of the last event carrying a well-formed hash (64 lowercase hex chars).
    head_sha256: str | None = Field(default=None, pattern=r"^[0-9a-f]{64}$")
    # Human-readable problems found during the audit; empty when there are none.
    errors: tuple[str, ...] = Field(default_factory=tuple)

    # Pydantic model configuration: instances are frozen.
    model_config = {"frozen": True}
52
+
53
+
54
def sha256_bytes(payload: bytes) -> str:
    """Hash ``payload`` with SHA-256 and return the hexadecimal digest."""
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
57
+
58
+
59
def sha256_file(path: Path) -> str:
    """Return the SHA-256 hexadecimal digest of the file at ``path``.

    The file is hashed in fixed-size chunks so that large sources are never
    loaded into memory in one piece.

    Args:
        path: File to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest of the file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        # read() returns b"" at end of file, which ends the loop.
        while chunk := handle.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
62
+
63
+
64
def dataforge_root_for(source_path: Path) -> Path:
    """Locate the ``.dataforge`` directory that sits next to ``source_path``.

    The source path is resolved first, so the result is anchored at the
    resolved parent directory of the source.
    """
    containing_dir = source_path.resolve().parent
    return containing_dir.joinpath(".dataforge")
67
+
68
+
69
def transactions_dir_for(source_path: Path) -> Path:
    """Locate the ``transactions`` directory inside the DataForge state directory."""
    state_dir = dataforge_root_for(source_path)
    return state_dir.joinpath("transactions")
72
+
73
+
74
def snapshots_dir_for(source_path: Path) -> Path:
    """Locate the ``snapshots`` directory inside the DataForge state directory."""
    state_dir = dataforge_root_for(source_path)
    return state_dir.joinpath("snapshots")
77
+
78
+
79
def cache_dir_for(source_path: Path) -> Path:
    """Locate the ``cache`` directory inside the DataForge state directory."""
    state_dir = dataforge_root_for(source_path)
    return state_dir.joinpath("cache")
82
+
83
+
84
def snapshot_path_for(source_path: Path, txn_id: str) -> Path:
    """Build the ``<txn_id>.bin`` snapshot path for a transaction."""
    snapshot_name = f"{txn_id}.bin"
    return snapshots_dir_for(source_path).joinpath(snapshot_name)
87
+
88
+
89
def transaction_log_path_for(source_path: Path, txn_id: str) -> Path:
    """Build the ``<txn_id>.jsonl`` journal path for a transaction."""
    log_name = f"{txn_id}.jsonl"
    return transactions_dir_for(source_path).joinpath(log_name)
92
+
93
+
94
+ def _utc_now() -> datetime:
95
+ """Return the current UTC timestamp."""
96
+ return datetime.now(UTC)
97
+
98
+
99
def _canonical_event_bytes(record: dict[str, Any]) -> bytes:
    """Produce the canonical bytes an event hash is computed over.

    The ``event_sha256`` field is left out, keys are sorted, separators are
    compact, and non-ASCII characters are emitted as-is before UTF-8 encoding.
    """
    hashable = dict(record)
    # The hash field cannot be part of its own hash material.
    hashable.pop("event_sha256", None)
    text = json.dumps(hashable, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
    return text.encode("utf-8")
108
+
109
+
110
def _event_sha256(record: dict[str, Any]) -> str:
    """Compute the SHA-256 digest of an event's canonical serialization."""
    hash_material = _canonical_event_bytes(record)
    return sha256_bytes(hash_material)
113
+
114
+
115
def _sign_event(record: dict[str, Any]) -> dict[str, Any]:
    """Build a new dict equal to ``record`` plus its ``event_sha256`` field.

    The input mapping is not modified. Any existing ``event_sha256`` value is
    ignored by the hash and overwritten in the result.
    """
    return {**record, "event_sha256": _event_sha256(record)}
120
+
121
+
122
def _write_jsonl_line(path: Path, record: dict[str, Any], *, create: bool = False) -> None:
    """Append or create a JSONL record on disk.

    Args:
        path: The target JSONL log path. Missing parent directories are
            created.
        record: JSON-serializable record to write.
        create: When true, fail if the file already exists.

    Raises:
        TransactionLogError: If the parent directory cannot be created or the
            record cannot be written (including when ``create`` is true and
            the file already exists).
        TypeError: If ``record`` is not JSON-serializable. Nothing is written
            to disk in that case.
    """
    # Serialize before touching the filesystem: a record that cannot be
    # serialized must not leave an empty, exclusively-created file behind,
    # because that file would make every later ``create=True`` attempt fail.
    line = json.dumps(record, sort_keys=True) + "\n"
    mode = "x" if create else "a"
    try:
        # mkdir is inside the try so its failures are reported the same way
        # as open/write failures.
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open(mode, encoding="utf-8", newline="\n") as handle:
            handle.write(line)
    except OSError as exc:
        raise TransactionLogError(f"Could not write transaction log '{path}': {exc}") from exc
141
+
142
+
143
def _read_records(log_path: Path) -> list[dict[str, Any]]:
    """Read non-empty JSONL records from a transaction log.

    Args:
        log_path: Path to the JSONL log file.

    Returns:
        The decoded JSON objects, in file order. Blank lines are skipped.

    Raises:
        TransactionLogError: If the file cannot be read or decoded as UTF-8,
            if a line is not valid JSON, or if a line is not a JSON object.
    """
    try:
        text = log_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        raise TransactionLogError(f"Could not read transaction log '{log_path}': {exc}") from exc

    records: list[dict[str, Any]] = []
    # Split on "\n" only. str.splitlines() also breaks on characters such as
    # U+2028 that may legally appear unescaped inside a JSON string, which
    # would cut a valid record in two. read_text() already translates "\r\n"
    # and "\r" to "\n".
    for line_number, raw_line in enumerate(text.split("\n"), 1):
        if not raw_line.strip():
            continue
        try:
            payload = json.loads(raw_line)
        except json.JSONDecodeError as exc:
            raise TransactionLogError(
                f"Malformed JSON at {log_path}:{line_number}: {exc.msg}"
            ) from exc
        if not isinstance(payload, dict):
            raise TransactionLogError(f"Malformed transaction event at {log_path}:{line_number}.")
        records.append(payload)
    return records
159
+
160
+
161
def _log_schema_version(log_path: Path) -> int | None:
    """Look up the ``schema_version`` of the first event in a log.

    Returns ``None`` when the log does not exist, holds no events, or the
    first event's ``schema_version`` is not an integer.
    """
    if log_path.exists():
        events = _read_records(log_path)
        if events:
            version = events[0].get("schema_version")
            if isinstance(version, int):
                return version
    return None
170
+
171
+
172
def _next_event_metadata(log_path: Path) -> tuple[int, str | None]:
    """Work out the index and previous-hash link for the next v2 event.

    Returns:
        The number of events already in the log (which is the next event's
        index) and the ``event_sha256`` of the last event.

    Raises:
        TransactionLogError: If the log holds no events or its last event
            lacks a well-formed ``event_sha256``.
    """
    events = _read_records(log_path)
    if len(events) == 0:
        raise TransactionLogError(f"Transaction log '{log_path}' contained no events.")

    tail_hash = events[-1].get("event_sha256")
    if isinstance(tail_hash, str) and _SHA256_RE.fullmatch(tail_hash):
        return len(events), tail_hash

    raise TransactionLogError(
        f"Transaction log '{log_path}' is missing a valid previous event hash."
    )
183
+
184
+
185
def _v1_created_record(transaction: RepairTransaction) -> dict[str, Any]:
    """Assemble the legacy (v1, unhashed) ``created`` event for a transaction."""
    event: dict[str, Any] = {
        "schema_version": LEGACY_SCHEMA_VERSION,
        "event_type": "created",
    }
    event["occurred_at"] = transaction.created_at.isoformat()
    event["transaction"] = transaction.model_dump(mode="json")
    return event
193
+
194
+
195
def _v2_created_record(transaction: RepairTransaction) -> dict[str, Any]:
    """Assemble the signed v2 ``created`` event that starts a hash chain.

    The event has index 0 and no previous hash.
    """
    unsigned: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "schema_name": SCHEMA_NAME,
        "event_index": 0,
        "event_type": "created",
    }
    unsigned["occurred_at"] = transaction.created_at.isoformat()
    unsigned["previous_event_sha256"] = None
    unsigned["transaction"] = transaction.model_dump(mode="json")
    return _sign_event(unsigned)
208
+
209
+
210
def _v1_applied_record(txn_id: str, post_sha256: str, applied_at: datetime) -> dict[str, Any]:
    """Assemble the legacy (v1, unhashed) ``applied`` event."""
    event: dict[str, Any] = {
        "schema_version": LEGACY_SCHEMA_VERSION,
        "event_type": "applied",
    }
    event["occurred_at"] = applied_at.isoformat()
    event["txn_id"] = txn_id
    event["post_sha256"] = post_sha256
    return event
219
+
220
+
221
def _v2_applied_record(
    log_path: Path,
    txn_id: str,
    post_sha256: str,
    applied_at: datetime,
) -> dict[str, Any]:
    """Assemble the signed v2 ``applied`` event that extends an existing log.

    The log is read to obtain the next event index and the hash of the
    current last event, which becomes this event's previous-hash link.
    """
    next_index, tail_hash = _next_event_metadata(log_path)
    unsigned: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "schema_name": SCHEMA_NAME,
        "event_index": next_index,
        "event_type": "applied",
    }
    unsigned["occurred_at"] = applied_at.isoformat()
    unsigned["previous_event_sha256"] = tail_hash
    unsigned["txn_id"] = txn_id
    unsigned["post_sha256"] = post_sha256
    return _sign_event(unsigned)
241
+
242
+
243
def _v1_reverted_record(txn_id: str, reverted_at: datetime) -> dict[str, Any]:
    """Assemble the legacy (v1, unhashed) ``reverted`` event."""
    event: dict[str, Any] = {
        "schema_version": LEGACY_SCHEMA_VERSION,
        "event_type": "reverted",
    }
    event["occurred_at"] = reverted_at.isoformat()
    event["txn_id"] = txn_id
    return event
251
+
252
+
253
def _v2_reverted_record(log_path: Path, txn_id: str, reverted_at: datetime) -> dict[str, Any]:
    """Assemble the signed v2 ``reverted`` event that extends an existing log.

    The log is read to obtain the next event index and the hash of the
    current last event, which becomes this event's previous-hash link.
    """
    next_index, tail_hash = _next_event_metadata(log_path)
    unsigned: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "schema_name": SCHEMA_NAME,
        "event_index": next_index,
        "event_type": "reverted",
    }
    unsigned["occurred_at"] = reverted_at.isoformat()
    unsigned["previous_event_sha256"] = tail_hash
    unsigned["txn_id"] = txn_id
    return _sign_event(unsigned)
267
+
268
+
269
def append_created_transaction(
    transaction: RepairTransaction,
    *,
    log_root: Path | None = None,
) -> Path:
    """Create a transaction's journal and write its ``created`` event.

    The journal is written in the hash-chained v2 format and is created
    exclusively, so an already existing journal is reported as an error.

    Args:
        transaction: The transaction to serialize.
        log_root: Optional directory under which the
            ``.dataforge/transactions`` journal directory is placed. When
            omitted, the journal lives next to the transaction's source path.

    Returns:
        The created JSONL log path.
    """
    source_path = Path(transaction.source_path)
    if log_root is None:
        log_path = transaction_log_path_for(source_path, transaction.txn_id)
    else:
        journal_dir = log_root.resolve() / ".dataforge" / "transactions"
        log_path = journal_dir / f"{transaction.txn_id}.jsonl"

    created_event = _v2_created_record(transaction)
    _write_jsonl_line(log_path, created_event, create=True)
    return log_path
290
+
291
+
292
def append_applied_event(
    log_path: Path,
    txn_id: str,
    post_sha256: str,
    *,
    applied_at: datetime | None = None,
) -> None:
    """Record that a transaction was applied.

    The event is written in the legacy v1 format when the log's first event
    is v1, and as a hash-chained v2 event otherwise.

    Args:
        log_path: Existing transaction log to extend.
        txn_id: Identifier of the transaction that was applied.
        post_sha256: Hash recorded as the post-apply state.
        applied_at: Event timestamp; defaults to the current UTC time.
    """
    occurred_at = applied_at if applied_at else _utc_now()
    if _log_schema_version(log_path) == LEGACY_SCHEMA_VERSION:
        record = _v1_applied_record(txn_id, post_sha256, occurred_at)
    else:
        record = _v2_applied_record(log_path, txn_id, post_sha256, occurred_at)
    _write_jsonl_line(log_path, record, create=False)
307
+
308
+
309
def append_reverted_event(
    log_path: Path,
    txn_id: str,
    *,
    reverted_at: datetime | None = None,
) -> None:
    """Record that a transaction was reverted.

    The event is written in the legacy v1 format when the log's first event
    is v1, and as a hash-chained v2 event otherwise.

    Args:
        log_path: Existing transaction log to extend.
        txn_id: Identifier of the transaction that was reverted.
        reverted_at: Event timestamp; defaults to the current UTC time.
    """
    occurred_at = reverted_at if reverted_at else _utc_now()
    if _log_schema_version(log_path) == LEGACY_SCHEMA_VERSION:
        record = _v1_reverted_record(txn_id, occurred_at)
    else:
        record = _v2_reverted_record(log_path, txn_id, occurred_at)
    _write_jsonl_line(log_path, record, create=False)
323
+
324
+
325
def load_transaction(log_path: Path) -> RepairTransaction:
    """Replay a transaction log into the latest transaction state.

    A ``created`` event (re)initializes the state from its ``transaction``
    payload. An ``applied`` event marks the transaction applied and records
    its ``post_sha256``. A ``reverted`` event records the event time as
    ``reverted_at``.

    Args:
        log_path: Path to the JSONL log file.

    Returns:
        The latest replayed transaction state.

    Raises:
        TransactionLogError: If the log is missing or malformed. This
            includes events that lack a required field or carry a value that
            cannot be validated or parsed.
    """
    if not log_path.exists():
        raise TransactionLogError(f"Transaction log not found: {log_path}")

    # A tuple is used instead of a set so the membership test compares by
    # equality only. An unhashable value (for example a JSON list) is then
    # rejected as unsupported rather than raising TypeError.
    supported_versions = (LEGACY_SCHEMA_VERSION, SCHEMA_VERSION)

    transaction: RepairTransaction | None = None
    for payload in _read_records(log_path):
        if payload.get("schema_version") not in supported_versions:
            raise TransactionLogError(
                f"Unsupported transaction log schema version in '{log_path}'."
            )

        event_type = payload.get("event_type")
        if event_type == "created":
            try:
                transaction = RepairTransaction.model_validate(payload["transaction"])
            except KeyError as exc:
                raise TransactionLogError(
                    f"Transaction log '{log_path}' has a created event without a "
                    "transaction payload."
                ) from exc
            except (TypeError, ValueError) as exc:
                # NOTE(review): pydantic v2's ValidationError is expected to be
                # a ValueError subclass and therefore handled here - confirm
                # against the pinned pydantic version.
                raise TransactionLogError(
                    f"Transaction log '{log_path}' has an invalid transaction payload: {exc}"
                ) from exc
            continue

        if transaction is None:
            raise TransactionLogError(
                f"Transaction log '{log_path}' is missing the initial created event."
            )

        if payload.get("txn_id") != transaction.txn_id:
            raise TransactionLogError(
                f"Transaction log '{log_path}' contains mismatched txn_id values."
            )

        if event_type == "applied":
            if "post_sha256" not in payload:
                raise TransactionLogError(
                    f"Transaction log '{log_path}' has an applied event without post_sha256."
                )
            update: dict[str, Any] = {
                "applied": True,
                "post_sha256": payload["post_sha256"],
            }
        elif event_type == "reverted":
            try:
                reverted_at = datetime.fromisoformat(payload["occurred_at"])
            except (KeyError, TypeError, ValueError) as exc:
                raise TransactionLogError(
                    f"Transaction log '{log_path}' has a reverted event with a missing or "
                    "invalid occurred_at timestamp."
                ) from exc
            update = {"reverted_at": reverted_at}
        else:
            raise TransactionLogError(
                f"Unknown transaction log event type '{event_type}' in '{log_path}'."
            )

        transaction = transaction.model_copy(update=update)

    if transaction is None:
        raise TransactionLogError(f"Transaction log '{log_path}' contained no transaction data.")

    return transaction
384
+
385
+
386
def find_transaction_log(txn_id: str, *, search_root: Path | None = None) -> Path:
    """Locate a transaction log by identifier under the working tree.

    The log directly under ``<root>/.dataforge/transactions`` is preferred.
    Otherwise the tree below the root is searched for a file named
    ``<txn_id>.jsonl`` inside any ``.dataforge/transactions`` directory.

    ``txn_id`` is always treated as a literal file name: glob metacharacters
    in it match nothing but themselves, and identifiers containing path
    separators are reported as not found.

    Args:
        txn_id: Canonical transaction identifier.
        search_root: Optional root directory to search under. Defaults to the
            current working directory.

    Returns:
        The unique matching JSONL log path.

    Raises:
        TransactionLogError: If no log or multiple logs are found.
    """
    root = (search_root or Path.cwd()).resolve()
    log_name = f"{txn_id}.jsonl"

    # An identifier with path separators (or a drive/anchor) would point
    # outside the transactions directory; such a name can never be a log.
    if Path(log_name).name != log_name:
        raise TransactionLogError(f"Could not find transaction '{txn_id}' under '{root}'.")

    direct_candidate = root / ".dataforge" / "transactions" / log_name
    if direct_candidate.exists():
        return direct_candidate

    # Glob for every journal and compare names literally, so that characters
    # such as "*", "?" or "[" in txn_id are not interpreted as wildcards.
    matches = [
        candidate
        for candidate in root.rglob("*.jsonl")
        if candidate.name == log_name
        and candidate.parent.name == "transactions"
        and candidate.parent.parent.name == ".dataforge"
    ]

    if not matches:
        raise TransactionLogError(f"Could not find transaction '{txn_id}' under '{root}'.")
    if len(matches) > 1:
        raise TransactionLogError(f"Found multiple transaction logs for '{txn_id}' under '{root}'.")
    return matches[0]
414
+
415
+
416
def verify_transaction_log(
    txn_id: str | None = None,
    *,
    log_path: Path | None = None,
    search_root: Path | None = None,
) -> TransactionAuditReport:
    """Verify a transaction log's local hash chain.

    Legacy v1 logs remain replayable but cannot be cryptographically verified,
    so they return ``legacy_unverified`` instead of ``verified``.

    Problems with the log are reported through the returned verdict rather
    than raised, including logs whose fields have unexpected JSON types.

    Args:
        txn_id: Expected transaction identifier. It is used to locate the log
            when ``log_path`` is not given and is compared against the
            identifier recorded in the log.
        log_path: Explicit path of the log to verify. When given, no search
            is performed.
        search_root: Root directory handed to ``find_transaction_log`` when
            the log has to be located by ``txn_id``.

    Returns:
        A report whose verdict is:

        * ``missing`` - neither argument was given, or no log was found.
        * ``malformed`` - the log is unreadable, empty, mixes schema
          versions, or (for v1 logs) cannot be replayed.
        * ``legacy_unverified`` - a replayable v1 log.
        * ``tampered`` - the v2 hash chain, event indices or transaction
          identifiers are inconsistent, or the log cannot be replayed.
        * ``unrevertible`` - the log is intact, but the applied and not yet
          reverted change can no longer be undone.
        * ``verified`` - everything checked out.
    """
    try:
        resolved_log_path = log_path.resolve() if log_path is not None else None
        if resolved_log_path is None:
            if txn_id is None:
                return TransactionAuditReport(
                    verdict=TransactionAuditVerdict.MISSING,
                    txn_id=txn_id,
                    event_count=0,
                    errors=("txn_id or log_path is required.",),
                )
            resolved_log_path = find_transaction_log(txn_id, search_root=search_root)
    except TransactionLogError as exc:
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MISSING,
            txn_id=txn_id,
            event_count=0,
            errors=(str(exc),),
        )

    if not resolved_log_path.exists():
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MISSING,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=0,
            errors=(f"Transaction log not found: {resolved_log_path}",),
        )

    try:
        records = _read_records(resolved_log_path)
    except (TransactionLogError, OSError, UnicodeDecodeError) as exc:
        # Unreadable or undecodable files are a malformed log, not a crash.
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MALFORMED,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=0,
            errors=(str(exc),),
        )

    if not records:
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MALFORMED,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=0,
            errors=("Transaction log contained no events.",),
        )

    # Versions are compared by equality instead of being collected into a
    # set: a log may carry an unhashable schema_version (a JSON list or
    # object), and that must produce a verdict, not a TypeError.
    raw_versions = [record.get("schema_version") for record in records]
    if all(version == LEGACY_SCHEMA_VERSION for version in raw_versions):
        try:
            transaction = load_transaction(resolved_log_path)
        except (TransactionLogError, KeyError, TypeError, ValueError) as exc:
            # Missing fields or invalid payloads can surface as KeyError,
            # TypeError or ValueError from the replay.
            return TransactionAuditReport(
                verdict=TransactionAuditVerdict.MALFORMED,
                log_path=str(resolved_log_path),
                schema_version=LEGACY_SCHEMA_VERSION,
                schema_name=LEGACY_SCHEMA_NAME,
                event_count=len(records),
                errors=(str(exc),),
            )
        if txn_id is not None and transaction.txn_id != txn_id:
            return TransactionAuditReport(
                verdict=TransactionAuditVerdict.TAMPERED,
                log_path=str(resolved_log_path),
                txn_id=transaction.txn_id,
                schema_version=LEGACY_SCHEMA_VERSION,
                schema_name=LEGACY_SCHEMA_NAME,
                event_count=len(records),
                errors=(f"Expected txn_id '{txn_id}', found '{transaction.txn_id}'.",),
            )
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.LEGACY_UNVERIFIED,
            log_path=str(resolved_log_path),
            txn_id=transaction.txn_id,
            schema_version=LEGACY_SCHEMA_VERSION,
            schema_name=LEGACY_SCHEMA_NAME,
            event_count=len(records),
            errors=("Legacy v1 logs do not contain event hashes.",),
        )

    if not all(version == SCHEMA_VERSION for version in raw_versions):
        seen_versions = sorted({str(version) for version in raw_versions})
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.MALFORMED,
            log_path=str(resolved_log_path),
            txn_id=txn_id,
            event_count=len(records),
            errors=(f"Mixed or unsupported schema versions: {seen_versions}.",),
        )

    errors: list[str] = []
    previous_hash: str | None = None
    resolved_txn_id: str | None = None
    head_sha256: str | None = None
    for expected_index, record in enumerate(records):
        if record.get("event_index") != expected_index:
            errors.append(f"Event {expected_index} has event_index {record.get('event_index')!r}.")
        if record.get("previous_event_sha256") != previous_hash:
            errors.append(f"Event {expected_index} previous hash does not match.")

        recorded_hash = record.get("event_sha256")
        if not isinstance(recorded_hash, str) or not _SHA256_RE.fullmatch(recorded_hash):
            errors.append(f"Event {expected_index} is missing a valid event hash.")
        else:
            calculated_hash = _event_sha256(record)
            if calculated_hash != recorded_hash:
                errors.append(f"Event {expected_index} hash does not match its payload.")
            # The chain advances on the recorded hash, so one bad payload is
            # reported once instead of also breaking every later link.
            previous_hash = recorded_hash
            head_sha256 = recorded_hash

        event_type = record.get("event_type")
        if event_type == "created":
            raw_transaction = record.get("transaction")
            if not isinstance(raw_transaction, dict):
                errors.append("Created event is missing a transaction payload.")
            else:
                current_txn_id = raw_transaction.get("txn_id")
                if not isinstance(current_txn_id, str):
                    errors.append("Created transaction payload is missing txn_id.")
                elif resolved_txn_id is None:
                    resolved_txn_id = current_txn_id
                elif resolved_txn_id != current_txn_id:
                    errors.append("Created transaction payload changed txn_id.")
        elif event_type in ("applied", "reverted"):
            current_txn_id = record.get("txn_id")
            if current_txn_id != resolved_txn_id:
                errors.append(
                    f"Event {expected_index} txn_id {current_txn_id!r} does not match created event."
                )
        else:
            errors.append(f"Event {expected_index} has unknown event_type {event_type!r}.")

    if txn_id is not None and resolved_txn_id is not None and resolved_txn_id != txn_id:
        errors.append(f"Expected txn_id '{txn_id}', found '{resolved_txn_id}'.")

    transaction = None
    try:
        transaction = load_transaction(resolved_log_path)
    except TransactionLogError as exc:
        errors.append(str(exc))
    except (KeyError, TypeError, ValueError) as exc:
        # Replay failures caused by missing or invalid event fields.
        errors.append(f"Transaction log could not be replayed: {exc!r}")

    # ``transaction`` is only None when the replay failed, in which case an
    # error was recorded and this branch returns.
    if errors or transaction is None:
        return TransactionAuditReport(
            verdict=TransactionAuditVerdict.TAMPERED,
            log_path=str(resolved_log_path),
            txn_id=resolved_txn_id or txn_id,
            schema_version=SCHEMA_VERSION,
            schema_name=SCHEMA_NAME,
            event_count=len(records),
            head_sha256=head_sha256,
            errors=tuple(errors),
        )

    if transaction.applied and transaction.reverted_at is None:
        source_path = Path(transaction.source_path)
        snapshot_path = Path(transaction.source_snapshot_path)
        revert_errors: list[str] = []
        if transaction.source_kind == "file":
            if not source_path.exists():
                revert_errors.append(f"Source file not found: {source_path}")
            elif transaction.post_sha256 is not None:
                try:
                    current_sha256 = sha256_file(source_path)
                except OSError as exc:
                    # For example the source is a directory or is unreadable.
                    revert_errors.append(f"Source file could not be read: {exc}")
                else:
                    if current_sha256 != transaction.post_sha256:
                        revert_errors.append(
                            "Source file no longer matches the recorded post-state hash."
                        )
        if not snapshot_path.exists():
            revert_errors.append(f"Source snapshot not found: {snapshot_path}")
        if revert_errors:
            return TransactionAuditReport(
                verdict=TransactionAuditVerdict.UNREVERTIBLE,
                log_path=str(resolved_log_path),
                txn_id=resolved_txn_id,
                schema_version=SCHEMA_VERSION,
                schema_name=SCHEMA_NAME,
                event_count=len(records),
                head_sha256=head_sha256,
                errors=tuple(revert_errors),
            )

    return TransactionAuditReport(
        verdict=TransactionAuditVerdict.VERIFIED,
        log_path=str(resolved_log_path),
        txn_id=resolved_txn_id,
        schema_version=SCHEMA_VERSION,
        schema_name=SCHEMA_NAME,
        event_count=len(records),
        head_sha256=head_sha256,
    )