serverless-data-mesh 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- serverless_data_mesh/__init__.py +93 -0
- serverless_data_mesh/catalog/__init__.py +6 -0
- serverless_data_mesh/catalog/glue_connector.py +17 -0
- serverless_data_mesh/catalog/glue_rest.py +134 -0
- serverless_data_mesh/cli.py +165 -0
- serverless_data_mesh/config.py +42 -0
- serverless_data_mesh/dashboard/__init__.py +5 -0
- serverless_data_mesh/dashboard/cloudwatch.py +80 -0
- serverless_data_mesh/dashboard/trust.py +162 -0
- serverless_data_mesh/exceptions.py +23 -0
- serverless_data_mesh/governance/__init__.py +9 -0
- serverless_data_mesh/governance/consumer_sla.py +109 -0
- serverless_data_mesh/lineage/__init__.py +5 -0
- serverless_data_mesh/lineage/openlineage.py +96 -0
- serverless_data_mesh/local/__init__.py +5 -0
- serverless_data_mesh/local/runtime.py +380 -0
- serverless_data_mesh/metrics/__init__.py +5 -0
- serverless_data_mesh/metrics/mesh_trust.py +56 -0
- serverless_data_mesh/orchestration/__init__.py +28 -0
- serverless_data_mesh/orchestration/canary.py +127 -0
- serverless_data_mesh/orchestration/coordinator.py +265 -0
- serverless_data_mesh/orchestration/durable_steps.py +74 -0
- serverless_data_mesh/orchestration/reprocess.py +143 -0
- serverless_data_mesh/orchestration/state.py +16 -0
- serverless_data_mesh/py.typed +0 -0
- serverless_data_mesh/rules/__init__.py +8 -0
- serverless_data_mesh/rules/sparkrules_connector.py +193 -0
- serverless_data_mesh/scaffold/__init__.py +5 -0
- serverless_data_mesh/scaffold/init_domain.py +210 -0
- serverless_data_mesh/types/__init__.py +21 -0
- serverless_data_mesh/types/workload.py +123 -0
- serverless_data_mesh/verification/__init__.py +21 -0
- serverless_data_mesh/verification/backend.py +41 -0
- serverless_data_mesh/verification/fallback.py +200 -0
- serverless_data_mesh/verification/vrp.py +202 -0
- serverless_data_mesh-0.2.0.dist-info/METADATA +143 -0
- serverless_data_mesh-0.2.0.dist-info/RECORD +40 -0
- serverless_data_mesh-0.2.0.dist-info/WHEEL +4 -0
- serverless_data_mesh-0.2.0.dist-info/entry_points.txt +2 -0
- serverless_data_mesh-0.2.0.dist-info/licenses/LICENSE +17 -0
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""IceGuard + Durable Execution coordinator for cross-domain lakehouse writes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from aws_durable_execution_sdk_python import DurableContext
|
|
9
|
+
from iceguard import protect
|
|
10
|
+
from iceguard.exceptions import IceGuardRollbackError
|
|
11
|
+
|
|
12
|
+
from serverless_data_mesh.catalog.glue_rest import GlueRestCatalogAdapter
|
|
13
|
+
from serverless_data_mesh.exceptions import VerificationRejectedError
|
|
14
|
+
from serverless_data_mesh.orchestration.durable_steps import (
|
|
15
|
+
durable_commit_metadata,
|
|
16
|
+
durable_write_chunk,
|
|
17
|
+
)
|
|
18
|
+
from serverless_data_mesh.orchestration.state import OrchestrationState
|
|
19
|
+
from serverless_data_mesh.types.workload import (
|
|
20
|
+
BatchWriterFn,
|
|
21
|
+
DataWriteWorkload,
|
|
22
|
+
SourceReaderFn,
|
|
23
|
+
WriteOutcome,
|
|
24
|
+
)
|
|
25
|
+
from serverless_data_mesh.metrics.mesh_trust import publish_vrp_metric
|
|
26
|
+
from serverless_data_mesh.orchestration.reprocess import attempt_vrp_repair
|
|
27
|
+
from serverless_data_mesh.verification.vrp import VRPProofGenerator, validate_then_commit
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _missing_from_repair(
    source: list[dict[str, Any]],
    sink: list[dict[str, Any]],
    identity_fields: tuple[str, ...],
) -> list[dict[str, Any]]:
    """Return the source records whose identity is absent from the sink.

    A record's identity is the tuple of ``str(record.get(field, ""))`` for each
    name in *identity_fields*, so a missing field counts as an empty string and
    ``1`` matches ``"1"``.

    Args:
        source: Records read from the source side.
        sink: Records read from the sink side.
        identity_fields: Field names that together identify a record.

    Returns:
        The records of *source*, in their original order, that have no
        identity match in *sink*.
    """

    def _key(record: dict[str, Any]) -> tuple[str, ...]:
        # A tuple keeps field boundaries intact. Joining the values with "|"
        # would make ("x|y", "z") and ("x", "y|z") indistinguishable and hide
        # a record that is genuinely missing.
        return tuple(str(record.get(name, "")) for name in identity_fields)

    sink_keys = {_key(record) for record in sink}
    return [record for record in source if _key(record) not in sink_keys]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class IceGuardDurableCoordinator:
    """Coordinate large lakehouse writes across chained durable steps.

    Composes IceGuard SafeWriter, AWS Durable Execution, veridata-recon proofs,
    and Glue REST catalog commits into a single governed transaction boundary.

    For every batch, ``execute_workload`` writes the data, builds and validates
    a proof, persists the proof, and checkpoints progress through the
    ``durable_write_chunk`` step. Once all batches are written it runs the
    ``durable_commit_metadata`` step.
    """

    def __init__(
        self,
        *,
        durable_context: DurableContext,
        lambda_context: Any,
        proof_generator: VRPProofGenerator,
        catalog_adapter: GlueRestCatalogAdapter | None = None,
        checkpoint_interval: int = 5000,
        rollback_threshold_ms: int = 30_000,
    ) -> None:
        """Store the collaborators used by ``execute_workload``.

        Args:
            durable_context: Context whose ``step`` method runs the durable
                steps; also forwarded to ``protect``.
            lambda_context: Passed as the first argument to ``protect``.
            proof_generator: Used to build and persist per-chunk proofs.
            catalog_adapter: Optional catalog adapter. When ``None`` one is
                created from the environment for each workload.
            checkpoint_interval: Forwarded to ``protect``.
            rollback_threshold_ms: Forwarded to ``protect``.
        """
        self._durable = durable_context
        self._lambda = lambda_context
        self._proofs = proof_generator
        self._catalog = catalog_adapter
        self._checkpoint_interval = checkpoint_interval
        self._rollback_threshold_ms = rollback_threshold_ms

    def _initial_state(self, workload: DataWriteWorkload) -> OrchestrationState:
        """Return a fresh state for *workload* with all counters at their defaults."""
        return OrchestrationState(workload_id=workload.workload_id)

    def _state_dict(self, state: OrchestrationState) -> dict[str, Any]:
        """Serialize *state* into a plain dict payload for a durable step.

        The path list is copied so that a consumer which extends the payload's
        list cannot alter ``state.all_parquet_paths`` through a shared reference.
        """
        return {
            "workload_id": state.workload_id,
            "next_offset": state.next_offset,
            "committed_chunks": state.committed_chunks,
            "last_proof_hash": state.last_proof_hash,
            "all_parquet_paths": list(state.all_parquet_paths),
        }

    def _workload_dict(self, workload: DataWriteWorkload) -> dict[str, Any]:
        """Serialize *workload*, including its boundary, into a plain dict payload."""
        return {
            "workload_id": workload.workload_id,
            "source_uri": workload.source_uri,
            "target_uri": workload.target_uri,
            "total_records": workload.total_records,
            "checkpoint_bucket": workload.checkpoint_bucket,
            "proof_bucket": workload.proof_bucket,
            "content_fields": list(workload.content_fields),
            "identity_fields": list(workload.identity_fields),
            "boundary": {
                "domain_id": workload.boundary.domain_id,
                "source_namespace": workload.boundary.source_namespace,
                "target_table": workload.boundary.target_table,
                "partition_spec": workload.boundary.partition_spec,
                "quality_policy_id": workload.boundary.quality_policy_id,
                "max_chunk_records": workload.boundary.max_chunk_records,
            },
        }

    def execute_workload(
        self,
        workload: DataWriteWorkload,
        *,
        batch_writer: BatchWriterFn,
        source_reader: SourceReaderFn,
        resume_state: OrchestrationState | None = None,
        sink_reader: SourceReaderFn | None = None,
        enable_auto_repair: bool = False,
    ) -> dict[str, Any]:
        """Run a large write as durable, resumable chunks under IceGuard protection.

        Args:
            workload: Description of the write (URIs, buckets, boundary, fields).
            batch_writer: Called with ``(start, end)``; its return value is
                recorded as the paths written for that batch.
            source_reader: Called with ``(start, end)`` to obtain source records.
            resume_state: State to continue from; a fresh state is created
                when ``None``.
            sink_reader: Optional reader for the records that landed in the
                sink. When ``None`` the source records are used on both sides
                of the proof.
            enable_auto_repair: When true, and a ``sink_reader`` is given, a
                failed verification triggers ``attempt_vrp_repair``.

        Returns:
            On success, a dict with ``outcome``, ``workload_id``,
            ``records_written``, ``chunks``, ``snapshot_id`` and
            ``proof_chain_tail``. When verification is rejected or IceGuard
            rolls back, a dict with ``outcome``, ``workload_id``,
            ``resume_offset`` and ``message``.
        """
        state = resume_state if resume_state is not None else self._initial_state(workload)
        outcome = WriteOutcome.RESUMED if state.next_offset > 0 else WriteOutcome.COMMITTED

        # Test against None explicitly so an injected adapter is never
        # replaced merely because it happens to evaluate as falsy.
        if self._catalog is not None:
            adapter = self._catalog
        else:
            adapter = GlueRestCatalogAdapter.from_environment(
                namespace=workload.boundary.source_namespace,
                table_name=workload.boundary.target_table,
            )
        adapter.connect()

        chunk_paths_by_batch: dict[tuple[int, int], list[str]] = {}
        chunk_index = state.committed_chunks

        try:
            with protect(
                self._lambda,
                table_format="iceberg",
                s3_bucket=workload.checkpoint_bucket,
                coordinator_id=workload.workload_id,
                durable_context=self._durable,
                rollback_threshold_ms=self._rollback_threshold_ms,
                checkpoint_interval=self._checkpoint_interval,
                catalog=adapter.catalog,
                table_identifier=(
                    f"{workload.boundary.source_namespace}.{workload.boundary.target_table}"
                ),
            ) as writer:

                def guarded_batch_writer(start: int, end: int) -> None:
                    """Write one batch, verify it, persist its proof and checkpoint it."""
                    nonlocal chunk_index, state

                    paths = batch_writer(start, end)
                    chunk_paths_by_batch[(start, end)] = paths

                    source_records = source_reader(start, end)
                    # Without a sink reader the proof compares the source with itself.
                    sink_records = (
                        sink_reader(start, end) if sink_reader is not None else source_records
                    )
                    proof = self._proofs.build_proof(
                        source_records=source_records,
                        sink_records=sink_records,
                        workload=workload,
                        chunk_start=start,
                        chunk_end=end,
                        prev_proof_hash=state.last_proof_hash,
                    )
                    verification = validate_then_commit(proof)
                    if (
                        verification.outcome != "PASS"
                        and enable_auto_repair
                        and sink_reader is not None
                    ):

                        def _repair_write(missing: list[dict[str, Any]]) -> list[dict[str, Any]]:
                            # NOTE(review): this re-invokes the batch writer over the
                            # offset range [start, start + len(missing)) rather than
                            # over the missing records themselves, discards the paths
                            # it returns, and reports the sink as the in-memory union
                            # instead of re-reading it. Confirm this is intended.
                            batch_writer(start, start + len(missing))
                            return sink_records + missing

                        # NOTE(review): no prev_proof_hash is passed here; confirm
                        # the repaired proof still links into the proof chain.
                        repair = attempt_vrp_repair(
                            source_records=source_records,
                            sink_records=sink_records,
                            workload=workload,
                            chunk_start=start,
                            chunk_end=end,
                            proof_generator=self._proofs,
                            write_repair_fn=_repair_write,
                        )
                        if repair.outcome == "repaired_pass" and repair.proof is not None:
                            proof = repair.proof
                            verification = validate_then_commit(proof)
                            sink_records = sink_records + _missing_from_repair(
                                source_records, sink_records, workload.identity_fields
                            )

                    # The metric is published for failing chunks too, before the gate below.
                    publish_vrp_metric(
                        domain_id=workload.boundary.domain_id,
                        verdict=proof["reconciliation"]["verdict"],
                        row_count=len(sink_records),
                        workload_id=workload.workload_id,
                    )
                    if verification.outcome != "PASS":
                        raise VerificationRejectedError(
                            f"VRP blocked chunk [{start}, {end}): {verification.reason}"
                        )

                    proof_uri = self._proofs.persist_proof(
                        proof,
                        bucket=workload.proof_bucket,
                        key_prefix=f"{workload.boundary.domain_id}/{workload.workload_id}",
                        chunk_index=chunk_index,
                    )
                    state.last_proof_hash = proof["proof_id"]

                    step_result = self._durable.step(
                        durable_write_chunk(
                            workload_payload=self._workload_dict(workload),
                            state_payload=self._state_dict(state),
                            chunk_index=chunk_index,
                            chunk_start=start,
                            chunk_end=end,
                            parquet_paths=paths,
                            proof_s3_uri=proof_uri,
                            verification_passed=True,
                        )
                    )
                    # The step result becomes the new authoritative state.
                    state = OrchestrationState(**step_result["state"])
                    chunk_index += 1

                writer.write(
                    path=workload.target_uri,
                    total_records=workload.total_records,
                    batch_writer=guarded_batch_writer,
                    track_paths=lambda start, end: chunk_paths_by_batch.get((start, end), []),
                )

            # NOTE(review): nesting was reconstructed from a flattened diff; confirm
            # that the metadata commit belongs after the ``protect`` block exits.
            commit_result = self._durable.step(
                durable_commit_metadata(
                    namespace=workload.boundary.source_namespace,
                    table_name=workload.boundary.target_table,
                    parquet_paths=state.all_parquet_paths,
                    snapshot_properties={
                        "app-id": "serverless-data-mesh",
                        "workload-id": workload.workload_id,
                        "domain-id": workload.boundary.domain_id,
                    },
                )
            )
            return {
                "outcome": outcome.value,
                "workload_id": workload.workload_id,
                "records_written": state.next_offset,
                "chunks": state.committed_chunks,
                "snapshot_id": commit_result["snapshot_id"],
                "proof_chain_tail": state.last_proof_hash,
            }

        except VerificationRejectedError as exc:
            logger.error("Data quality gate failed for %s: %s", workload.workload_id, exc)
            adapter.abort()
            return {
                "outcome": WriteOutcome.VERIFICATION_FAILED.value,
                "workload_id": workload.workload_id,
                "resume_offset": state.next_offset,
                "message": str(exc),
            }

        except IceGuardRollbackError as exc:
            logger.warning(
                "IceGuard rollback for workload %s near timeout (remaining=%sms)",
                workload.workload_id,
                exc.remaining_time_ms,
            )
            adapter.abort()
            return {
                "outcome": WriteOutcome.ROLLED_BACK.value,
                "workload_id": workload.workload_id,
                "resume_offset": state.next_offset,
                "message": (
                    "Physical files rolled back; re-invoke Lambda to resume "
                    "from durable checkpoint without duplicating committed chunks."
                ),
            }
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""AWS Durable Execution step functions for mesh writes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from aws_durable_execution_sdk_python import StepContext, durable_step
|
|
8
|
+
|
|
9
|
+
from serverless_data_mesh.catalog.glue_rest import GlueRestCatalogAdapter
|
|
10
|
+
from serverless_data_mesh.orchestration.state import OrchestrationState
|
|
11
|
+
from serverless_data_mesh.types.workload import ChunkWriteResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@durable_step
def durable_write_chunk(
    step_context: StepContext,
    *,
    workload_payload: dict[str, Any],
    state_payload: dict[str, Any],
    chunk_index: int,
    chunk_start: int,
    chunk_end: int,
    parquet_paths: list[str],
    proof_s3_uri: str,
    verification_passed: bool,
) -> dict[str, Any]:
    """Checkpoint one successfully verified chunk (idempotent on replay).

    No argument is mutated, so running the step again with the same arguments
    produces the same result.

    Args:
        step_context: Supplied by the durable execution runtime; not read here.
        workload_payload: Serialized workload; accepted but not read by this step.
        state_payload: Serialized ``OrchestrationState`` from before this chunk.
        chunk_index: Index of the chunk being checkpointed.
        chunk_start: First record offset of the chunk.
        chunk_end: End offset of the chunk; becomes the new ``next_offset``.
        parquet_paths: Paths written for this chunk.
        proof_s3_uri: Location of the persisted proof for this chunk.
        verification_passed: Whether verification passed for this chunk.

    Returns:
        A dict with ``chunk`` (the chunk summary) and ``state`` (the advanced
        orchestration state, serialized).
    """
    _ = step_context
    result = ChunkWriteResult(
        chunk_index=chunk_index,
        record_offset=chunk_start,
        record_end=chunk_end,
        parquet_paths=parquet_paths,
        proof_s3_uri=proof_s3_uri,
        verification_passed=verification_passed,
    )
    state = OrchestrationState(**state_payload)
    state.next_offset = chunk_end
    state.committed_chunks += 1
    # Rebind to a new list instead of extending in place: the state object shares
    # its list with ``state_payload``, and mutating it would leak into the caller
    # and append the same paths twice if the step body ran again.
    state.all_parquet_paths = [*state.all_parquet_paths, *parquet_paths]
    return {
        "chunk": {
            "chunk_index": result.chunk_index,
            "record_offset": result.record_offset,
            "record_end": result.record_end,
            "parquet_paths": result.parquet_paths,
            "proof_s3_uri": result.proof_s3_uri,
            "verification_passed": result.verification_passed,
        },
        "state": {
            "workload_id": state.workload_id,
            "next_offset": state.next_offset,
            "committed_chunks": state.committed_chunks,
            "last_proof_hash": state.last_proof_hash,
            "all_parquet_paths": state.all_parquet_paths,
        },
    }
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@durable_step
def durable_commit_metadata(
    step_context: StepContext,
    *,
    namespace: str,
    table_name: str,
    parquet_paths: list[str],
    snapshot_properties: dict[str, str] | None = None,
) -> dict[str, Any]:
    """Checkpoint the Iceberg REST metadata commit (2PC phase-2).

    Creates a catalog adapter from the environment for the given table, stages
    *parquet_paths* with ``prepare_commit`` and then commits them together with
    the optional *snapshot_properties*.

    Args:
        step_context: Supplied by the durable execution runtime; not read here.
        namespace: Namespace handed to the adapter factory.
        table_name: Table name handed to the adapter factory.
        parquet_paths: Paths to stage for the commit.
        snapshot_properties: Optional properties forwarded to ``commit``.

    Returns:
        A dict with ``snapshot_id`` (the value returned by ``commit``) and
        ``file_count`` (the number of paths supplied).
    """
    _ = step_context
    catalog = GlueRestCatalogAdapter.from_environment(
        namespace=namespace,
        table_name=table_name,
    )
    # Stage first, then commit; the order matters.
    catalog.prepare_commit(parquet_paths)
    committed_snapshot = catalog.commit(snapshot_properties=snapshot_properties)
    summary: dict[str, Any] = {
        "snapshot_id": committed_snapshot,
        "file_count": len(parquet_paths),
    }
    return summary
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Automatic VRP-triggered reprocessing: detect, repair, re-proof, commit or escalate."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any, Callable
|
|
8
|
+
|
|
9
|
+
from serverless_data_mesh.types.workload import DataWriteWorkload
|
|
10
|
+
from serverless_data_mesh.verification.backend import create_proof_generator
|
|
11
|
+
from serverless_data_mesh.verification.vrp import validate_then_commit
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True, slots=True)
|
|
17
|
+
class ReprocessResult:
|
|
18
|
+
"""Outcome of automatic repair after VRP FAIL."""
|
|
19
|
+
|
|
20
|
+
outcome: str # repaired_pass | escalated
|
|
21
|
+
attempts: int
|
|
22
|
+
missing_before: int
|
|
23
|
+
missing_after: int
|
|
24
|
+
final_verdict: str
|
|
25
|
+
message: str
|
|
26
|
+
proof: dict[str, Any] | None = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _identity_key(record: dict[str, Any], fields: tuple[str, ...]) -> str:
    """Return an unambiguous string key for *record* over the given *fields*.

    Each field value is rendered with ``str`` (a missing field becomes the
    empty string) and the parts are joined with ``|``. Backslashes and pipes
    inside a value are escaped first, so ``("x|y", "z")`` and ``("x", "y|z")``
    no longer collapse into the same key. Values containing neither character
    produce exactly the plain ``|``-joined string.
    """

    def _escape(value: Any) -> str:
        # Escape the escape character first, then the delimiter.
        return str(value).replace("\\", "\\\\").replace("|", "\\|")

    return "|".join(_escape(record.get(f, "")) for f in fields)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _find_missing_records(
    source: list[dict[str, Any]],
    sink: list[dict[str, Any]],
    identity_fields: tuple[str, ...],
) -> list[dict[str, Any]]:
    """Return the source records whose identity key does not occur in the sink.

    Keys are computed with ``_identity_key`` over *identity_fields*. The result
    keeps the order of *source*.
    """
    present: set[str] = set()
    for landed in sink:
        present.add(_identity_key(landed, identity_fields))

    absent: list[dict[str, Any]] = []
    for candidate in source:
        if _identity_key(candidate, identity_fields) in present:
            continue
        absent.append(candidate)
    return absent
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def attempt_vrp_repair(
    *,
    source_records: list[dict[str, Any]],
    sink_records: list[dict[str, Any]],
    workload: DataWriteWorkload,
    chunk_start: int,
    chunk_end: int,
    proof_generator: Any | None = None,
    max_attempts: int = 2,
    write_repair_fn: Callable[[list[dict[str, Any]]], list[dict[str, Any]]] | None = None,
) -> ReprocessResult:
    """On VRP FAIL, re-read missing records, repair sink, regenerate proof.

    Flow:
        VRP FAIL -> identify missing IDs -> merge into sink -> new VRP
        -> PASS: repaired_pass | still FAIL: escalated

    Args:
        source_records: Records read from the source for this chunk.
        sink_records: Records currently in the sink for this chunk; not mutated.
        workload: Workload whose ``identity_fields`` define record identity.
        chunk_start: Start offset forwarded to ``build_proof``.
        chunk_end: End offset forwarded to ``build_proof``.
        proof_generator: Object exposing ``build_proof``; created with
            ``create_proof_generator`` when ``None``.
        max_attempts: Upper bound on repair attempts. Values below 1 mean no
            repair is attempted and the result is escalated.
        write_repair_fn: Optional callback that receives the missing records
            and returns the new sink contents. Without it the missing records
            are merged into the in-memory sink.

    Returns:
        A ``ReprocessResult`` describing the outcome and carrying the last
        proof that was built.
    """
    if proof_generator is None:
        gen, _ = create_proof_generator()
    else:
        gen = proof_generator

    def _prove(current_sink: list[dict[str, Any]]) -> tuple[dict[str, Any], str]:
        """Build a proof for *current_sink* and return it with its verdict."""
        built = gen.build_proof(
            source_records=source_records,
            sink_records=current_sink,
            workload=workload,
            chunk_start=chunk_start,
            chunk_end=chunk_end,
        )
        return built, validate_then_commit(built).outcome

    sink = list(sink_records)
    missing_before = len(
        _find_missing_records(source_records, sink, workload.identity_fields)
    )

    if missing_before == 0:
        proof, verdict = _prove(sink)
        if verdict == "PASS":
            return ReprocessResult(
                outcome="repaired_pass",
                attempts=0,
                missing_before=0,
                missing_after=0,
                final_verdict=verdict,
                message="No missing records; original proof issue was mutation/duplicate",
                proof=proof,
            )
        return ReprocessResult(
            outcome="escalated",
            attempts=0,
            missing_before=0,
            missing_after=0,
            final_verdict=verdict,
            message="VRP FAIL without drops; escalate to human",
            proof=proof,
        )

    if max_attempts < 1:
        # With no attempts allowed the loop below would never bind a proof or
        # verdict, so report the unrepaired state explicitly.
        proof, verdict = _prove(sink)
        return ReprocessResult(
            outcome="escalated",
            attempts=0,
            missing_before=missing_before,
            missing_after=missing_before,
            final_verdict=verdict,
            message=f"No repair attempted (max_attempts={max_attempts}); escalate to human",
            proof=proof,
        )

    attempts_made = 0
    missing_after = missing_before
    for attempt in range(1, max_attempts + 1):
        missing = _find_missing_records(source_records, sink, workload.identity_fields)
        if not missing:
            # Only reachable after a previous attempt: nothing is left to
            # write, so another repair call cannot change the verdict and
            # could replace the repaired sink with whatever the callback
            # returns for an empty batch.
            break
        if write_repair_fn is not None:
            sink = write_repair_fn(missing)
        else:
            sink = sink + missing
        attempts_made = attempt

        proof, verdict = _prove(sink)
        missing_after = len(
            _find_missing_records(source_records, sink, workload.identity_fields)
        )

        logger.info(
            "VRP repair attempt %s: verdict=%s missing_before=%s missing_after=%s",
            attempt,
            verdict,
            missing_before,
            missing_after,
        )

        if verdict == "PASS":
            return ReprocessResult(
                outcome="repaired_pass",
                attempts=attempt,
                missing_before=missing_before,
                missing_after=missing_after,
                final_verdict=verdict,
                message=f"Repaired {missing_before} missing records on attempt {attempt}",
                proof=proof,
            )

    return ReprocessResult(
        outcome="escalated",
        attempts=attempts_made,
        missing_before=missing_before,
        missing_after=missing_after,
        final_verdict=verdict,
        message=f"VRP still FAIL after {attempts_made} repair attempts; escalate to human",
        proof=proof,
    )
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Durable orchestration state models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(slots=True)
|
|
9
|
+
class OrchestrationState:
|
|
10
|
+
"""Checkpointed progress for a multi-chunk domain write."""
|
|
11
|
+
|
|
12
|
+
workload_id: str
|
|
13
|
+
next_offset: int = 0
|
|
14
|
+
committed_chunks: int = 0
|
|
15
|
+
last_proof_hash: str | None = None
|
|
16
|
+
all_parquet_paths: list[str] = field(default_factory=list)
|
|
File without changes
|