serverless-data-mesh 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. serverless_data_mesh/__init__.py +93 -0
  2. serverless_data_mesh/catalog/__init__.py +6 -0
  3. serverless_data_mesh/catalog/glue_connector.py +17 -0
  4. serverless_data_mesh/catalog/glue_rest.py +134 -0
  5. serverless_data_mesh/cli.py +165 -0
  6. serverless_data_mesh/config.py +42 -0
  7. serverless_data_mesh/dashboard/__init__.py +5 -0
  8. serverless_data_mesh/dashboard/cloudwatch.py +80 -0
  9. serverless_data_mesh/dashboard/trust.py +162 -0
  10. serverless_data_mesh/exceptions.py +23 -0
  11. serverless_data_mesh/governance/__init__.py +9 -0
  12. serverless_data_mesh/governance/consumer_sla.py +109 -0
  13. serverless_data_mesh/lineage/__init__.py +5 -0
  14. serverless_data_mesh/lineage/openlineage.py +96 -0
  15. serverless_data_mesh/local/__init__.py +5 -0
  16. serverless_data_mesh/local/runtime.py +380 -0
  17. serverless_data_mesh/metrics/__init__.py +5 -0
  18. serverless_data_mesh/metrics/mesh_trust.py +56 -0
  19. serverless_data_mesh/orchestration/__init__.py +28 -0
  20. serverless_data_mesh/orchestration/canary.py +127 -0
  21. serverless_data_mesh/orchestration/coordinator.py +265 -0
  22. serverless_data_mesh/orchestration/durable_steps.py +74 -0
  23. serverless_data_mesh/orchestration/reprocess.py +143 -0
  24. serverless_data_mesh/orchestration/state.py +16 -0
  25. serverless_data_mesh/py.typed +0 -0
  26. serverless_data_mesh/rules/__init__.py +8 -0
  27. serverless_data_mesh/rules/sparkrules_connector.py +193 -0
  28. serverless_data_mesh/scaffold/__init__.py +5 -0
  29. serverless_data_mesh/scaffold/init_domain.py +210 -0
  30. serverless_data_mesh/types/__init__.py +21 -0
  31. serverless_data_mesh/types/workload.py +123 -0
  32. serverless_data_mesh/verification/__init__.py +21 -0
  33. serverless_data_mesh/verification/backend.py +41 -0
  34. serverless_data_mesh/verification/fallback.py +200 -0
  35. serverless_data_mesh/verification/vrp.py +202 -0
  36. serverless_data_mesh-0.2.0.dist-info/METADATA +143 -0
  37. serverless_data_mesh-0.2.0.dist-info/RECORD +40 -0
  38. serverless_data_mesh-0.2.0.dist-info/WHEEL +4 -0
  39. serverless_data_mesh-0.2.0.dist-info/entry_points.txt +2 -0
  40. serverless_data_mesh-0.2.0.dist-info/licenses/LICENSE +17 -0
@@ -0,0 +1,265 @@
1
+ """IceGuard + Durable Execution coordinator for cross-domain lakehouse writes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from aws_durable_execution_sdk_python import DurableContext
9
+ from iceguard import protect
10
+ from iceguard.exceptions import IceGuardRollbackError
11
+
12
+ from serverless_data_mesh.catalog.glue_rest import GlueRestCatalogAdapter
13
+ from serverless_data_mesh.exceptions import VerificationRejectedError
14
+ from serverless_data_mesh.orchestration.durable_steps import (
15
+ durable_commit_metadata,
16
+ durable_write_chunk,
17
+ )
18
+ from serverless_data_mesh.orchestration.state import OrchestrationState
19
+ from serverless_data_mesh.types.workload import (
20
+ BatchWriterFn,
21
+ DataWriteWorkload,
22
+ SourceReaderFn,
23
+ WriteOutcome,
24
+ )
25
+ from serverless_data_mesh.metrics.mesh_trust import publish_vrp_metric
26
+ from serverless_data_mesh.orchestration.reprocess import attempt_vrp_repair
27
+ from serverless_data_mesh.verification.vrp import VRPProofGenerator, validate_then_commit
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ def _missing_from_repair(
33
+ source: list[dict[str, Any]],
34
+ sink: list[dict[str, Any]],
35
+ identity_fields: tuple[str, ...],
36
+ ) -> list[dict[str, Any]]:
37
+ sink_ids = {"|".join(str(r.get(f, "")) for f in identity_fields) for r in sink}
38
+ return [
39
+ r
40
+ for r in source
41
+ if "|".join(str(r.get(f, "")) for f in identity_fields) not in sink_ids
42
+ ]
43
+
44
+
45
class IceGuardDurableCoordinator:
    """Coordinate large lakehouse writes across chained durable steps.

    Composes IceGuard SafeWriter, AWS Durable Execution, veridata-recon proofs,
    and Glue REST catalog commits into a single governed transaction boundary.
    """

    def __init__(
        self,
        *,
        durable_context: DurableContext,
        lambda_context: Any,
        proof_generator: VRPProofGenerator,
        catalog_adapter: GlueRestCatalogAdapter | None = None,
        checkpoint_interval: int = 5000,
        rollback_threshold_ms: int = 30_000,
    ) -> None:
        """Store the collaborators and tuning values used by ``execute_workload``.

        Args:
            durable_context: Object whose ``step`` method runs the durable
                steps; it is also forwarded to ``protect``.
            lambda_context: Passed as the first positional argument to
                ``protect``.
            proof_generator: Builds and persists the per-chunk proofs.
            catalog_adapter: Adapter to use for the catalog. When falsy, one
                is created from the environment for each workload.
            checkpoint_interval: Forwarded to ``protect`` unchanged.
            rollback_threshold_ms: Forwarded to ``protect`` unchanged.
        """
        self._durable = durable_context
        self._lambda = lambda_context
        self._proofs = proof_generator
        self._catalog = catalog_adapter
        self._checkpoint_interval = checkpoint_interval
        self._rollback_threshold_ms = rollback_threshold_ms

    def _initial_state(self, workload: DataWriteWorkload) -> OrchestrationState:
        """Return a fresh state for ``workload`` with all counters at their defaults."""
        return OrchestrationState(workload_id=workload.workload_id)

    def _state_dict(self, state: OrchestrationState) -> dict[str, Any]:
        """Flatten ``state`` into a plain dict suitable as a durable-step payload."""
        return {
            "workload_id": state.workload_id,
            "next_offset": state.next_offset,
            "committed_chunks": state.committed_chunks,
            "last_proof_hash": state.last_proof_hash,
            # Copy the list so the payload never aliases the live state: a
            # step that appends to the payload must not silently change the
            # coordinator's own state object.
            "all_parquet_paths": list(state.all_parquet_paths),
        }

    def _workload_dict(self, workload: DataWriteWorkload) -> dict[str, Any]:
        """Flatten ``workload`` (including its boundary) into a plain dict payload."""
        return {
            "workload_id": workload.workload_id,
            "source_uri": workload.source_uri,
            "target_uri": workload.target_uri,
            "total_records": workload.total_records,
            "checkpoint_bucket": workload.checkpoint_bucket,
            "proof_bucket": workload.proof_bucket,
            "content_fields": list(workload.content_fields),
            "identity_fields": list(workload.identity_fields),
            "boundary": {
                "domain_id": workload.boundary.domain_id,
                "source_namespace": workload.boundary.source_namespace,
                "target_table": workload.boundary.target_table,
                "partition_spec": workload.boundary.partition_spec,
                "quality_policy_id": workload.boundary.quality_policy_id,
                "max_chunk_records": workload.boundary.max_chunk_records,
            },
        }

    def execute_workload(
        self,
        workload: DataWriteWorkload,
        *,
        batch_writer: BatchWriterFn,
        source_reader: SourceReaderFn,
        resume_state: OrchestrationState | None = None,
        sink_reader: SourceReaderFn | None = None,
        enable_auto_repair: bool = False,
    ) -> dict[str, Any]:
        """Run a large write as durable, resumable chunks under IceGuard protection.

        For every ``(start, end)`` batch requested by the IceGuard writer the
        coordinator writes the batch, builds and validates a proof, optionally
        attempts an automatic repair, publishes a metric, persists the proof
        and checkpoints the chunk through a durable step. After the write, a
        second durable step commits the catalog metadata.

        Args:
            workload: Description of the write (URIs, buckets, boundary, fields).
            batch_writer: Called as ``batch_writer(start, end)``; its return
                value is recorded as the chunk's parquet paths.
            source_reader: Called as ``source_reader(start, end)`` to obtain
                the source records for the proof.
            resume_state: State to continue from; a fresh state is created
                when it is falsy.
            sink_reader: Called as ``sink_reader(start, end)`` to obtain the
                sink records. When omitted, the source records are used as
                the sink records.
            enable_auto_repair: When true and ``sink_reader`` is given, a
                failed verification triggers ``attempt_vrp_repair``.

        Returns:
            On success, a dict with ``outcome``, ``workload_id``,
            ``records_written``, ``chunks``, ``snapshot_id`` and
            ``proof_chain_tail``. When a chunk is rejected or IceGuard rolls
            back, a dict with ``outcome``, ``workload_id``, ``resume_offset``
            and ``message``.
        """
        state = resume_state or self._initial_state(workload)
        # A non-zero offset means earlier chunks were already checkpointed.
        outcome = WriteOutcome.RESUMED if state.next_offset > 0 else WriteOutcome.COMMITTED

        adapter = self._catalog or GlueRestCatalogAdapter.from_environment(
            namespace=workload.boundary.source_namespace,
            table_name=workload.boundary.target_table,
        )
        adapter.connect()

        # Paths written per batch, keyed by (start, end); read back by track_paths.
        chunk_paths_by_batch: dict[tuple[int, int], list[str]] = {}
        chunk_index = state.committed_chunks

        try:
            with protect(
                self._lambda,
                table_format="iceberg",
                s3_bucket=workload.checkpoint_bucket,
                coordinator_id=workload.workload_id,
                durable_context=self._durable,
                rollback_threshold_ms=self._rollback_threshold_ms,
                checkpoint_interval=self._checkpoint_interval,
                catalog=adapter.catalog,
                table_identifier=(
                    f"{workload.boundary.source_namespace}.{workload.boundary.target_table}"
                ),
            ) as writer:

                def guarded_batch_writer(start: int, end: int) -> None:
                    """Write, verify, persist the proof for, and checkpoint one batch."""
                    nonlocal chunk_index, state

                    paths = batch_writer(start, end)
                    chunk_paths_by_batch[(start, end)] = paths

                    source_records = source_reader(start, end)
                    sink_records = sink_reader(start, end) if sink_reader else source_records
                    proof = self._proofs.build_proof(
                        source_records=source_records,
                        sink_records=sink_records,
                        workload=workload,
                        chunk_start=start,
                        chunk_end=end,
                        prev_proof_hash=state.last_proof_hash,
                    )
                    verification = validate_then_commit(proof)
                    if verification.outcome != "PASS" and enable_auto_repair and sink_reader:

                        def _repair_write(missing: list[dict[str, Any]]) -> list[dict[str, Any]]:
                            # Re-invokes the batch writer for a range sized by
                            # the number of missing records, then reports the
                            # sink as the original records plus the missing ones.
                            batch_writer(start, start + len(missing))
                            return sink_records + missing

                        repair = attempt_vrp_repair(
                            source_records=source_records,
                            sink_records=sink_records,
                            workload=workload,
                            chunk_start=start,
                            chunk_end=end,
                            proof_generator=self._proofs,
                            write_repair_fn=_repair_write,
                        )
                        if repair.outcome == "repaired_pass" and repair.proof is not None:
                            # Adopt the repaired proof and re-validate it here.
                            proof = repair.proof
                            verification = validate_then_commit(proof)
                            sink_records = sink_records + _missing_from_repair(
                                source_records, sink_records, workload.identity_fields
                            )

                    # The metric is published for every chunk, pass or fail.
                    publish_vrp_metric(
                        domain_id=workload.boundary.domain_id,
                        verdict=proof["reconciliation"]["verdict"],
                        row_count=len(sink_records),
                        workload_id=workload.workload_id,
                    )
                    if verification.outcome != "PASS":
                        raise VerificationRejectedError(
                            f"VRP blocked chunk [{start}, {end}): {verification.reason}"
                        )

                    proof_uri = self._proofs.persist_proof(
                        proof,
                        bucket=workload.proof_bucket,
                        key_prefix=f"{workload.boundary.domain_id}/{workload.workload_id}",
                        chunk_index=chunk_index,
                    )
                    # Recorded before the step so the payload carries the new hash.
                    state.last_proof_hash = proof["proof_id"]

                    step_result = self._durable.step(
                        durable_write_chunk(
                            workload_payload=self._workload_dict(workload),
                            state_payload=self._state_dict(state),
                            chunk_index=chunk_index,
                            chunk_start=start,
                            chunk_end=end,
                            parquet_paths=paths,
                            proof_s3_uri=proof_uri,
                            verification_passed=True,
                        )
                    )
                    # The step's returned state becomes the coordinator's state.
                    state = OrchestrationState(**step_result["state"])
                    chunk_index += 1

                writer.write(
                    path=workload.target_uri,
                    total_records=workload.total_records,
                    batch_writer=guarded_batch_writer,
                    track_paths=lambda start, end: chunk_paths_by_batch.get((start, end), []),
                )

            # NOTE(review): the available listing lost its indentation; the
            # metadata commit is placed after the protect() block here.
            # Confirm this nesting against the packaged file.
            commit_result = self._durable.step(
                durable_commit_metadata(
                    namespace=workload.boundary.source_namespace,
                    table_name=workload.boundary.target_table,
                    parquet_paths=state.all_parquet_paths,
                    snapshot_properties={
                        "app-id": "serverless-data-mesh",
                        "workload-id": workload.workload_id,
                        "domain-id": workload.boundary.domain_id,
                    },
                )
            )
            return {
                "outcome": outcome.value,
                "workload_id": workload.workload_id,
                "records_written": state.next_offset,
                "chunks": state.committed_chunks,
                "snapshot_id": commit_result["snapshot_id"],
                "proof_chain_tail": state.last_proof_hash,
            }

        except VerificationRejectedError as exc:
            logger.error("Data quality gate failed for %s: %s", workload.workload_id, exc)
            adapter.abort()
            return {
                "outcome": WriteOutcome.VERIFICATION_FAILED.value,
                "workload_id": workload.workload_id,
                "resume_offset": state.next_offset,
                "message": str(exc),
            }

        except IceGuardRollbackError as exc:
            logger.warning(
                "IceGuard rollback for workload %s near timeout (remaining=%sms)",
                workload.workload_id,
                exc.remaining_time_ms,
            )
            adapter.abort()
            return {
                "outcome": WriteOutcome.ROLLED_BACK.value,
                "workload_id": workload.workload_id,
                "resume_offset": state.next_offset,
                "message": (
                    "Physical files rolled back; re-invoke Lambda to resume "
                    "from durable checkpoint without duplicating committed chunks."
                ),
            }
@@ -0,0 +1,74 @@
1
+ """AWS Durable Execution step functions for mesh writes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from aws_durable_execution_sdk_python import StepContext, durable_step
8
+
9
+ from serverless_data_mesh.catalog.glue_rest import GlueRestCatalogAdapter
10
+ from serverless_data_mesh.orchestration.state import OrchestrationState
11
+ from serverless_data_mesh.types.workload import ChunkWriteResult
12
+
13
+
14
@durable_step
def durable_write_chunk(
    step_context: StepContext,
    *,
    workload_payload: dict[str, Any],
    state_payload: dict[str, Any],
    chunk_index: int,
    chunk_start: int,
    chunk_end: int,
    parquet_paths: list[str],
    proof_s3_uri: str,
    verification_passed: bool,
) -> dict[str, Any]:
    """Checkpoint one verified chunk and return the advanced orchestration state.

    The function does not modify any of its arguments, so calling it again
    with the same payloads produces the same result.

    Args:
        step_context: Supplied by the durable-step machinery; not used here.
        workload_payload: Accepted for the step's recorded inputs; not read
            by this function.
        state_payload: Keyword arguments for ``OrchestrationState`` describing
            progress before this chunk.
        chunk_index: Index of the chunk being checkpointed.
        chunk_start: First record offset of the chunk.
        chunk_end: End offset of the chunk; becomes the new ``next_offset``.
        parquet_paths: Paths written for this chunk.
        proof_s3_uri: Location of the persisted proof for this chunk.
        verification_passed: Whether the chunk's proof was accepted.

    Returns:
        A dict with a ``"chunk"`` entry echoing the chunk fields and a
        ``"state"`` entry holding the updated state as plain values.
    """
    _ = step_context
    result = ChunkWriteResult(
        chunk_index=chunk_index,
        record_offset=chunk_start,
        record_end=chunk_end,
        parquet_paths=parquet_paths,
        proof_s3_uri=proof_s3_uri,
        verification_passed=verification_passed,
    )
    state = OrchestrationState(**state_payload)
    state.next_offset = chunk_end
    state.committed_chunks += 1
    # Build a new list instead of extending in place: the state constructed
    # above shares its list object with ``state_payload``, and an in-place
    # extend would modify the caller's data.
    state.all_parquet_paths = [*state.all_parquet_paths, *parquet_paths]
    return {
        "chunk": {
            "chunk_index": result.chunk_index,
            "record_offset": result.record_offset,
            "record_end": result.record_end,
            "parquet_paths": result.parquet_paths,
            "proof_s3_uri": result.proof_s3_uri,
            "verification_passed": result.verification_passed,
        },
        "state": {
            "workload_id": state.workload_id,
            "next_offset": state.next_offset,
            "committed_chunks": state.committed_chunks,
            "last_proof_hash": state.last_proof_hash,
            "all_parquet_paths": state.all_parquet_paths,
        },
    }
58
+
59
+
60
@durable_step
def durable_commit_metadata(
    step_context: StepContext,
    *,
    namespace: str,
    table_name: str,
    parquet_paths: list[str],
    snapshot_properties: dict[str, str] | None = None,
) -> dict[str, Any]:
    """Prepare and commit the given parquet paths through the Glue REST catalog adapter.

    Args:
        step_context: Supplied by the durable-step machinery; not used here.
        namespace: Namespace used to build the adapter from the environment.
        table_name: Table name used to build the adapter from the environment.
        parquet_paths: Paths handed to the adapter's ``prepare_commit``.
        snapshot_properties: Forwarded to the adapter's ``commit``.

    Returns:
        A dict with the ``snapshot_id`` returned by the commit and the
        ``file_count`` of paths that were submitted.
    """
    _ = step_context
    catalog = GlueRestCatalogAdapter.from_environment(
        namespace=namespace,
        table_name=table_name,
    )
    # Stage the files first, then commit the snapshot.
    catalog.prepare_commit(parquet_paths)
    committed_snapshot = catalog.commit(snapshot_properties=snapshot_properties)
    summary: dict[str, Any] = {
        "snapshot_id": committed_snapshot,
        "file_count": len(parquet_paths),
    }
    return summary
@@ -0,0 +1,143 @@
1
+ """Automatic VRP-triggered reprocessing: detect, repair, re-proof, commit or escalate."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass
7
+ from typing import Any, Callable
8
+
9
+ from serverless_data_mesh.types.workload import DataWriteWorkload
10
+ from serverless_data_mesh.verification.backend import create_proof_generator
11
+ from serverless_data_mesh.verification.vrp import validate_then_commit
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ @dataclass(frozen=True, slots=True)
17
+ class ReprocessResult:
18
+ """Outcome of automatic repair after VRP FAIL."""
19
+
20
+ outcome: str # repaired_pass | escalated
21
+ attempts: int
22
+ missing_before: int
23
+ missing_after: int
24
+ final_verdict: str
25
+ message: str
26
+ proof: dict[str, Any] | None = None
27
+
28
+
29
+ def _identity_key(record: dict[str, Any], fields: tuple[str, ...]) -> str:
30
+ return "|".join(str(record.get(f, "")) for f in fields)
31
+
32
+
33
def _find_missing_records(
    source: list[dict[str, Any]],
    sink: list[dict[str, Any]],
    identity_fields: tuple[str, ...],
) -> list[dict[str, Any]]:
    """Return the source records whose identity key does not occur in the sink.

    Keys are computed with ``_identity_key`` over ``identity_fields``. The
    order of ``source`` is preserved in the result.
    """
    # Collect every sink key once so each source lookup is a set membership test.
    known_keys: set[str] = set()
    for sunk in sink:
        known_keys.add(_identity_key(sunk, identity_fields))
    return [
        candidate
        for candidate in source
        if _identity_key(candidate, identity_fields) not in known_keys
    ]
40
+
41
+
42
def attempt_vrp_repair(
    *,
    source_records: list[dict[str, Any]],
    sink_records: list[dict[str, Any]],
    workload: DataWriteWorkload,
    chunk_start: int,
    chunk_end: int,
    proof_generator: Any | None = None,
    max_attempts: int = 2,
    write_repair_fn: Callable[[list[dict[str, Any]]], list[dict[str, Any]]] | None = None,
) -> ReprocessResult:
    """On VRP FAIL, re-read missing records, repair sink, regenerate proof.

    Flow:
        VRP FAIL -> identify missing IDs -> merge into sink -> new VRP
        -> PASS: repaired_pass | still FAIL: escalated

    Args:
        source_records: Records read from the source for this chunk.
        sink_records: Records read from the sink for this chunk; the list
            itself is not modified.
        workload: Supplies ``identity_fields`` and is forwarded to the proof
            generator.
        chunk_start: Start offset forwarded to the proof generator.
        chunk_end: End offset forwarded to the proof generator.
        proof_generator: Object with a ``build_proof`` method. When ``None``,
            one is obtained from ``create_proof_generator``.
        max_attempts: Upper bound on repair attempts. Values below 1 mean no
            repair is tried and the unrepaired sink is proven and escalated.
        write_repair_fn: Called with the missing records on each attempt and
            expected to return the resulting sink records. When falsy, the
            missing records are appended to the in-memory sink instead.

    Returns:
        A ``ReprocessResult`` whose ``outcome`` is ``"repaired_pass"`` when a
        rebuilt proof validates as ``"PASS"`` and ``"escalated"`` otherwise.
    """
    if proof_generator is None:
        gen, _ = create_proof_generator()
    else:
        gen = proof_generator

    def _prove(current_sink: list[dict[str, Any]]) -> tuple[dict[str, Any], str]:
        # Build a proof for the given sink and return it with its verdict.
        # NOTE(review): no prev_proof_hash is passed to build_proof here;
        # confirm whether repaired proofs need to be chained to earlier ones.
        built = gen.build_proof(
            source_records=source_records,
            sink_records=current_sink,
            workload=workload,
            chunk_start=chunk_start,
            chunk_end=chunk_end,
        )
        return built, validate_then_commit(built).outcome

    sink = list(sink_records)
    missing_before = len(
        _find_missing_records(source_records, sink, workload.identity_fields)
    )

    if missing_before == 0:
        # Nothing to merge: re-prove the sink as it stands.
        proof, verdict = _prove(sink)
        if verdict == "PASS":
            return ReprocessResult(
                outcome="repaired_pass",
                attempts=0,
                missing_before=0,
                missing_after=0,
                final_verdict=verdict,
                message="No missing records; original proof issue was mutation/duplicate",
                proof=proof,
            )
        return ReprocessResult(
            outcome="escalated",
            attempts=0,
            missing_before=0,
            missing_after=0,
            final_verdict=verdict,
            message="VRP FAIL without drops; escalate to human",
            proof=proof,
        )

    if max_attempts < 1:
        # No repair attempt is permitted. Prove the unrepaired sink so the
        # escalation still carries a real verdict and proof, instead of
        # reaching the final return with unbound locals.
        proof, verdict = _prove(sink)
        return ReprocessResult(
            outcome="escalated",
            attempts=0,
            missing_before=missing_before,
            missing_after=missing_before,
            final_verdict=verdict,
            message=(
                f"No repair attempts permitted (max_attempts={max_attempts}); "
                "escalate to human"
            ),
            proof=proof,
        )

    for attempt in range(1, max_attempts + 1):
        missing = _find_missing_records(source_records, sink, workload.identity_fields)
        if write_repair_fn:
            sink = write_repair_fn(missing)
        else:
            sink = sink + missing

        proof, verdict = _prove(sink)
        missing_after = len(
            _find_missing_records(source_records, sink, workload.identity_fields)
        )

        logger.info(
            "VRP repair attempt %s: verdict=%s missing_before=%s missing_after=%s",
            attempt,
            verdict,
            missing_before,
            missing_after,
        )

        if verdict == "PASS":
            return ReprocessResult(
                outcome="repaired_pass",
                attempts=attempt,
                missing_before=missing_before,
                missing_after=missing_after,
                final_verdict=verdict,
                message=f"Repaired {missing_before} missing records on attempt {attempt}",
                proof=proof,
            )

    # Every permitted attempt ran and none produced a passing proof.
    return ReprocessResult(
        outcome="escalated",
        attempts=max_attempts,
        missing_before=missing_before,
        missing_after=missing_after,
        final_verdict=verdict,
        message=f"VRP still FAIL after {max_attempts} repair attempts; escalate to human",
        proof=proof,
    )
@@ -0,0 +1,16 @@
1
+ """Durable orchestration state models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
+ @dataclass(slots=True)
9
+ class OrchestrationState:
10
+ """Checkpointed progress for a multi-chunk domain write."""
11
+
12
+ workload_id: str
13
+ next_offset: int = 0
14
+ committed_chunks: int = 0
15
+ last_proof_hash: str | None = None
16
+ all_parquet_paths: list[str] = field(default_factory=list)
File without changes
@@ -0,0 +1,8 @@
1
+ """Business rules connectors for mesh domain writers."""
2
+
3
+ from serverless_data_mesh.rules.sparkrules_connector import (
4
+ RuleFireSummary,
5
+ SparkRulesConnector,
6
+ )
7
+
8
+ __all__ = ["RuleFireSummary", "SparkRulesConnector"]