serverless-data-mesh 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- serverless_data_mesh/__init__.py +93 -0
- serverless_data_mesh/catalog/__init__.py +6 -0
- serverless_data_mesh/catalog/glue_connector.py +17 -0
- serverless_data_mesh/catalog/glue_rest.py +134 -0
- serverless_data_mesh/cli.py +165 -0
- serverless_data_mesh/config.py +42 -0
- serverless_data_mesh/dashboard/__init__.py +5 -0
- serverless_data_mesh/dashboard/cloudwatch.py +80 -0
- serverless_data_mesh/dashboard/trust.py +162 -0
- serverless_data_mesh/exceptions.py +23 -0
- serverless_data_mesh/governance/__init__.py +9 -0
- serverless_data_mesh/governance/consumer_sla.py +109 -0
- serverless_data_mesh/lineage/__init__.py +5 -0
- serverless_data_mesh/lineage/openlineage.py +96 -0
- serverless_data_mesh/local/__init__.py +5 -0
- serverless_data_mesh/local/runtime.py +380 -0
- serverless_data_mesh/metrics/__init__.py +5 -0
- serverless_data_mesh/metrics/mesh_trust.py +56 -0
- serverless_data_mesh/orchestration/__init__.py +28 -0
- serverless_data_mesh/orchestration/canary.py +127 -0
- serverless_data_mesh/orchestration/coordinator.py +265 -0
- serverless_data_mesh/orchestration/durable_steps.py +74 -0
- serverless_data_mesh/orchestration/reprocess.py +143 -0
- serverless_data_mesh/orchestration/state.py +16 -0
- serverless_data_mesh/py.typed +0 -0
- serverless_data_mesh/rules/__init__.py +8 -0
- serverless_data_mesh/rules/sparkrules_connector.py +193 -0
- serverless_data_mesh/scaffold/__init__.py +5 -0
- serverless_data_mesh/scaffold/init_domain.py +210 -0
- serverless_data_mesh/types/__init__.py +21 -0
- serverless_data_mesh/types/workload.py +123 -0
- serverless_data_mesh/verification/__init__.py +21 -0
- serverless_data_mesh/verification/backend.py +41 -0
- serverless_data_mesh/verification/fallback.py +200 -0
- serverless_data_mesh/verification/vrp.py +202 -0
- serverless_data_mesh-0.2.0.dist-info/METADATA +143 -0
- serverless_data_mesh-0.2.0.dist-info/RECORD +40 -0
- serverless_data_mesh-0.2.0.dist-info/WHEEL +4 -0
- serverless_data_mesh-0.2.0.dist-info/entry_points.txt +2 -0
- serverless_data_mesh-0.2.0.dist-info/licenses/LICENSE +17 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Pure-Python multiset verifier when veridata-recon wheels are unavailable."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
import uuid
|
|
9
|
+
from collections import Counter
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from serverless_data_mesh.types.workload import DataWriteWorkload
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _coerce_records(records: list[dict[str, Any]]) -> list[dict[str, str]]:
|
|
17
|
+
return [{key: str(value) for key, value in record.items()} for record in records]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _identity_rule(fields: tuple[str, ...]) -> str:
|
|
21
|
+
if len(fields) == 1:
|
|
22
|
+
return f"field:{fields[0]}"
|
|
23
|
+
return f"composite:[{','.join(fields)}]"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _record_fingerprint(
|
|
27
|
+
record: dict[str, str],
|
|
28
|
+
*,
|
|
29
|
+
identity_fields: tuple[str, ...],
|
|
30
|
+
content_fields: tuple[str, ...],
|
|
31
|
+
salt: str,
|
|
32
|
+
) -> str:
|
|
33
|
+
identity = "|".join(record.get(f, "") for f in identity_fields)
|
|
34
|
+
content = "|".join(record.get(f, "") for f in content_fields)
|
|
35
|
+
payload = f"{salt}:{identity}:{content}".encode()
|
|
36
|
+
return hashlib.sha256(payload).hexdigest()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def reconcile_multiset(
    *,
    source: list[dict[str, Any]],
    sink: list[dict[str, Any]],
    identity_fields: tuple[str, ...],
    content_fields: tuple[str, ...],
    salt: str = "sdm-fallback-salt",
) -> dict[str, Any]:
    """Compare source and sink multisets; same verdict shape as veridata-recon.

    Args:
        source: Rows read from the source side.
        sink: Rows read from the sink side.
        identity_fields: Fields that identify a row.
        content_fields: Fields whose values are compared.
        salt: Salt forwarded to the per-record fingerprint.

    Returns:
        A dict with these keys:

        * ``verdict``: ``"PASS"`` when nothing is missing, duplicated or
          mutated, otherwise ``"FAIL"``. Computed from the full lists.
        * ``missing``: fingerprints occurring more often in source than in
          sink, one entry per surplus occurrence, sorted, first 100 only.
        * ``duplicated``: fingerprints occurring more often in sink than in
          source, same layout, first 100 only.
        * ``mutated``: ``|``-joined identity values whose fingerprint differs
          between the two sides, first 100 only.
        * ``source_count`` / ``sink_count``: number of rows on each side.
        * ``backend``: always ``"pure-python-fallback"``.

    A row whose content changed usually shows up in ``missing`` and
    ``duplicated`` as well as ``mutated``, because its fingerprint differs on
    the two sides.
    """
    src = _coerce_records(source)
    snk = _coerce_records(sink)

    def fingerprints(rows: list[dict[str, str]]) -> list[str]:
        return [
            _record_fingerprint(
                row,
                identity_fields=identity_fields,
                content_fields=content_fields,
                salt=salt,
            )
            for row in rows
        ]

    # Hash every row exactly once; the same digests feed both the multiset
    # comparison and the mutation check below.
    src_fps = fingerprints(src)
    snk_fps = fingerprints(snk)

    src_counter = Counter(src_fps)
    snk_counter = Counter(snk_fps)

    # Counter subtraction keeps only positive surpluses; elements() repeats each
    # fingerprint once per surplus occurrence.
    missing: list[str] = sorted((src_counter - snk_counter).elements())
    duplicated: list[str] = sorted((snk_counter - src_counter).elements())

    # Mutation: same identity, different content fingerprint in sink vs source
    mutated: list[str] = []
    if identity_fields:

        def by_identity(
            rows: list[dict[str, str]], fps: list[str]
        ) -> dict[tuple[str, ...], str]:
            # Key by the tuple of identity values so that values containing "|"
            # cannot make two different identities look alike. When an identity
            # repeats, the last row's fingerprint wins.
            return {
                tuple(row.get(name, "") for name in identity_fields): fp
                for row, fp in zip(rows, fps)
            }

        src_by_id = by_identity(src, src_fps)
        snk_by_id = by_identity(snk, snk_fps)
        for ident, src_fp in src_by_id.items():
            snk_fp = snk_by_id.get(ident)
            if snk_fp is not None and snk_fp != src_fp:
                mutated.append("|".join(ident))

    verdict = "PASS" if not missing and not duplicated and not mutated else "FAIL"
    return {
        "verdict": verdict,
        "missing": missing[:100],
        "duplicated": duplicated[:100],
        "mutated": mutated[:100],
        "source_count": len(src),
        "sink_count": len(snk),
        "backend": "pure-python-fallback",
    }
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class FallbackProofGenerator:
    """Drop-in proof generator using pure-Python multiset reconciliation.

    Proof documents carry ``proof_version`` ``"0.1-fallback"`` and a
    ``public_key`` of ``None``. ``persist_proof`` writes to the local
    filesystem rather than to S3.
    """

    producer: str = "serverless-data-mesh/fallback-verifier"

    def __init__(self, *, salt_hex: str | None = None) -> None:
        """Store the salt, generating a random one when none (or "") is given."""
        self.salt_hex = salt_hex or uuid.uuid4().hex

    def build_proof(
        self,
        *,
        source_records: list[dict[str, Any]],
        sink_records: list[dict[str, Any]],
        workload: DataWriteWorkload,
        chunk_start: int,
        chunk_end: int,
        prev_proof_hash: str | None = None,
    ) -> dict[str, Any]:
        """Reconcile one chunk and wrap the result in a proof document.

        Args:
            source_records: Rows read from the source side of the chunk.
            sink_records: Rows read from the sink side of the chunk.
            workload: Supplies the identity/content fields, URIs, workload id
                and boundary recorded in the proof.
            chunk_start: Start offset recorded in the boundary value.
            chunk_end: End offset recorded in the boundary value.
            prev_proof_hash: Hash of the preceding proof, stored under
                ``chain``.

        Returns:
            The proof document, including a ``proof_id`` that is the SHA-256
            hex digest of the canonical JSON of all other entries.
        """
        reconciliation = reconcile_multiset(
            source=source_records,
            sink=sink_records,
            identity_fields=workload.identity_fields,
            content_fields=workload.content_fields,
            salt=self.salt_hex,
        )
        # The boundary is compact, key-sorted JSON, base64-encoded.
        boundary_value = base64.b64encode(
            json.dumps(
                {
                    "workload_id": workload.workload_id,
                    "domain_id": workload.boundary.domain_id,
                    "start": chunk_start,
                    "end": chunk_end,
                    "partition": workload.boundary.partition_spec,
                },
                separators=(",", ":"),
                sort_keys=True,
            ).encode()
        ).decode("ascii")

        document: dict[str, Any] = {
            "proof_version": "0.1-fallback",
            "created_at": datetime.now(timezone.utc).replace(microsecond=0).isoformat(),
            "producer": self.producer,
            "boundary": {"mode": "OFFSET_RANGE", "value": boundary_value},
            "source_ref": workload.source_uri,
            "sink_ref": workload.target_uri,
            "hash_algorithm": "sha256",
            "salt": self.salt_hex,
            "identity_rule": _identity_rule(workload.identity_fields),
            "content_fields": list(workload.content_fields),
            "reconciliation": reconciliation,
            "public_key": None,
            "chain": {"prev_proof_hash": prev_proof_hash},
        }
        # proof_id is derived before it is inserted, so it covers every other key.
        document["proof_id"] = hashlib.sha256(
            json.dumps(document, separators=(",", ":"), sort_keys=True).encode()
        ).hexdigest()
        return document

    def persist_proof(
        self,
        proof: dict[str, Any],
        *,
        bucket: str,
        key_prefix: str,
        chunk_index: int,
        s3_client: Any | None = None,
    ) -> str:
        """Write the proof as JSON beneath the local directory ``bucket``.

        Args:
            proof: Proof document to serialise.
            bucket: Local directory used as the root for the written file.
            key_prefix: Slash-separated sub-path under ``bucket``; leading and
                trailing slashes are ignored.
            chunk_index: Chunk number, zero-padded to six digits in the name.
            s3_client: Accepted but not used by this implementation.

        Returns:
            ``file://`` followed by the destination path.
        """
        from pathlib import Path

        # Strip slashes on both ends before joining. An empty or "/"-prefixed
        # key_prefix would otherwise yield an absolute relative part, and pathlib
        # lets an absolute segment replace everything before it, which would
        # write the proof outside the bucket directory.
        dest = (
            Path(bucket)
            / key_prefix.strip("/")
            / "proofs"
            / f"chunk-{chunk_index:06d}.vrp.json"
        )
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(json.dumps(proof, indent=2, sort_keys=True), encoding="utf-8")
        return f"file://{dest}"
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Cryptographic verification via veridata-recon (VRP v0.1)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import tempfile
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import boto3
|
|
15
|
+
|
|
16
|
+
from serverless_data_mesh.types.workload import DataWriteWorkload
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _vr() -> Any:
|
|
22
|
+
try:
|
|
23
|
+
import veridata_recon as vr
|
|
24
|
+
|
|
25
|
+
return vr
|
|
26
|
+
except ImportError as exc:
|
|
27
|
+
msg = (
|
|
28
|
+
"veridata-recon is required for VRPProofGenerator. "
|
|
29
|
+
"Use create_proof_generator() for automatic fallback, or "
|
|
30
|
+
"pip install veridata-recon (Python 3.12+, Linux/macOS wheels)."
|
|
31
|
+
)
|
|
32
|
+
raise ImportError(msg) from exc
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True, slots=True)
|
|
36
|
+
class VerifyResult:
|
|
37
|
+
"""Outcome of a validate-then-commit reconciliation check."""
|
|
38
|
+
|
|
39
|
+
outcome: str
|
|
40
|
+
reason: str | None = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
ValidateResult = VerifyResult
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _identity_rule(fields: tuple[str, ...]) -> str:
|
|
47
|
+
if len(fields) == 1:
|
|
48
|
+
return f"field:{fields[0]}"
|
|
49
|
+
return f"composite:[{','.join(fields)}]"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _coerce_records(records: list[dict[str, Any]]) -> list[dict[str, str]]:
|
|
53
|
+
"""veridata-recon requires string field values for canonical hashing."""
|
|
54
|
+
return [{key: str(value) for key, value in record.items()} for record in records]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(slots=True)
|
|
58
|
+
class VRPProofGenerator:
|
|
59
|
+
"""Generate offline, tamper-evident reconciliation proofs for pipeline chunks."""
|
|
60
|
+
|
|
61
|
+
private_key_b64: str
|
|
62
|
+
public_key_b64: str
|
|
63
|
+
salt_hex: str
|
|
64
|
+
producer: str = "serverless-data-mesh/0.1.0"
|
|
65
|
+
|
|
66
|
+
@classmethod
|
|
67
|
+
def from_env(cls) -> VRPProofGenerator:
|
|
68
|
+
"""Load signing keys from ``VRP_SIGNING_KEY_B64`` or generate ephemeral ones."""
|
|
69
|
+
raw = os.environ.get("VRP_SIGNING_KEY_B64")
|
|
70
|
+
vr = _vr()
|
|
71
|
+
if raw:
|
|
72
|
+
keys = vr.keypair_from_private(raw)
|
|
73
|
+
else:
|
|
74
|
+
keys = vr.generate_keypair()
|
|
75
|
+
return cls(
|
|
76
|
+
private_key_b64=keys["private_key"],
|
|
77
|
+
public_key_b64=keys["public_key"],
|
|
78
|
+
salt_hex=vr.generate_salt(),
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
def build_proof(
|
|
82
|
+
self,
|
|
83
|
+
*,
|
|
84
|
+
source_records: list[dict[str, Any]],
|
|
85
|
+
sink_records: list[dict[str, Any]],
|
|
86
|
+
workload: DataWriteWorkload,
|
|
87
|
+
chunk_start: int,
|
|
88
|
+
chunk_end: int,
|
|
89
|
+
prev_proof_hash: str | None = None,
|
|
90
|
+
) -> dict[str, Any]:
|
|
91
|
+
"""Hash and compare source vs target partition, returning a proof envelope."""
|
|
92
|
+
vr = _vr()
|
|
93
|
+
identity_rule = _identity_rule(workload.identity_fields)
|
|
94
|
+
content_fields = list(workload.content_fields)
|
|
95
|
+
|
|
96
|
+
reconciliation = vr.reconcile(
|
|
97
|
+
source=_coerce_records(source_records),
|
|
98
|
+
sink=_coerce_records(sink_records),
|
|
99
|
+
identity_rule=identity_rule,
|
|
100
|
+
content_fields=content_fields,
|
|
101
|
+
salt=self.salt_hex,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
boundary_value = base64.b64encode(
|
|
105
|
+
json.dumps(
|
|
106
|
+
{
|
|
107
|
+
"workload_id": workload.workload_id,
|
|
108
|
+
"domain_id": workload.boundary.domain_id,
|
|
109
|
+
"start": chunk_start,
|
|
110
|
+
"end": chunk_end,
|
|
111
|
+
"partition": workload.boundary.partition_spec,
|
|
112
|
+
},
|
|
113
|
+
separators=(",", ":"),
|
|
114
|
+
sort_keys=True,
|
|
115
|
+
).encode()
|
|
116
|
+
).decode("ascii")
|
|
117
|
+
|
|
118
|
+
document: dict[str, Any] = {
|
|
119
|
+
"proof_version": "0.1",
|
|
120
|
+
"created_at": datetime.now(timezone.utc).replace(microsecond=0).isoformat(),
|
|
121
|
+
"producer": self.producer,
|
|
122
|
+
"boundary": {"mode": "OFFSET_RANGE", "value": boundary_value},
|
|
123
|
+
"source_ref": workload.source_uri,
|
|
124
|
+
"sink_ref": workload.target_uri,
|
|
125
|
+
"hash_algorithm": "sha256",
|
|
126
|
+
"salt": self.salt_hex,
|
|
127
|
+
"identity_rule": identity_rule,
|
|
128
|
+
"content_fields": content_fields,
|
|
129
|
+
"reconciliation": reconciliation,
|
|
130
|
+
"public_key": self.public_key_b64,
|
|
131
|
+
"chain": {"prev_proof_hash": prev_proof_hash},
|
|
132
|
+
}
|
|
133
|
+
document["proof_id"] = vr.hash_bytes(
|
|
134
|
+
json.dumps(document, separators=(",", ":"), sort_keys=True).encode("utf-8")
|
|
135
|
+
)
|
|
136
|
+
return document
|
|
137
|
+
|
|
138
|
+
def persist_proof(
|
|
139
|
+
self,
|
|
140
|
+
proof: dict[str, Any],
|
|
141
|
+
*,
|
|
142
|
+
bucket: str,
|
|
143
|
+
key_prefix: str,
|
|
144
|
+
chunk_index: int,
|
|
145
|
+
s3_client: Any | None = None,
|
|
146
|
+
) -> str:
|
|
147
|
+
"""Write the reconciliation proof JSON alongside IceGuard Parquet artifacts."""
|
|
148
|
+
client = s3_client or boto3.client("s3")
|
|
149
|
+
key = f"{key_prefix.rstrip('/')}/proofs/chunk-{chunk_index:06d}.vrp.json"
|
|
150
|
+
body = json.dumps(proof, indent=2, sort_keys=True).encode("utf-8")
|
|
151
|
+
verdict = proof["reconciliation"]["verdict"]
|
|
152
|
+
client.put_object(
|
|
153
|
+
Bucket=bucket,
|
|
154
|
+
Key=key,
|
|
155
|
+
Body=body,
|
|
156
|
+
ContentType="application/json",
|
|
157
|
+
Metadata={"proof-id": proof["proof_id"], "verdict": verdict},
|
|
158
|
+
)
|
|
159
|
+
uri = f"s3://{bucket}/{key}"
|
|
160
|
+
logger.info("Persisted VRP proof to %s (verdict=%s)", uri, verdict)
|
|
161
|
+
return uri
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def validate_then_commit(
    proof: dict[str, Any],
    *,
    public_key_b64: str | None = None,
) -> VerifyResult:
    """Validate-then-commit hook: verify reconciliation before metadata commit.

    Args:
        proof: Proof document; must contain ``reconciliation`` with a
            ``verdict``.
        public_key_b64: Key for the offline check. Falls back to the proof's
            own ``public_key`` entry when omitted.

    Returns:
        ``VerifyResult("PASS")`` when the recorded verdict is ``"PASS"``.
        Otherwise a result carrying the recorded verdict and a reason listing
        the missing/mutated/duplicated counts, extended with the offline
        verification outcome when that differs from the recorded verdict.

    Raises:
        KeyError: If ``proof`` lacks ``reconciliation`` or its ``verdict``.

    NOTE(review): a recorded ``"PASS"`` is returned without running
    ``verify_proof``; confirm that trusting the self-reported verdict is
    intended. The counts are the lengths of the lists stored in the proof, so
    they are capped wherever the producer truncated those lists.
    """
    reconciliation = proof["reconciliation"]
    verdict = reconciliation["verdict"]
    if verdict == "PASS":
        return VerifyResult("PASS")

    missing = len(reconciliation.get("missing", []))
    mutated = len(reconciliation.get("mutated", []))
    duplicated = len(reconciliation.get("duplicated", []))
    reason = (
        f"reconciliation {verdict}: missing={missing}, "
        f"mutated={mutated}, duplicated={duplicated}"
    )
    logger.error("VRP validation blocked metadata commit: %s", reason)

    pubkey = public_key_b64 or proof.get("public_key")
    if pubkey:
        # Best effort: a failure here is logged and the recorded verdict is
        # still returned.
        try:
            vr = _vr()
            temp_path: str | None = None
            try:
                with tempfile.NamedTemporaryFile(
                    mode="w",
                    suffix=".vrp.json",
                    delete=False,
                    encoding="utf-8",
                ) as handle:
                    # Record the name first so a serialisation failure still
                    # gets cleaned up.
                    temp_path = handle.name
                    json.dump(proof, handle, indent=2, sort_keys=True)
                offline = vr.verify_proof(temp_path, pubkey)
            finally:
                # delete=False leaves removal to us; doing it in finally stops
                # the file leaking when json.dump or verify_proof raises.
                if temp_path is not None:
                    os.unlink(temp_path)
            if offline != verdict:
                reason = f"{reason}; offline_verify={offline}"
        except Exception as exc:
            logger.warning("Offline veridata-recon verify_proof skipped: %s", exc)

    return VerifyResult(verdict, reason)
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: serverless-data-mesh
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Federated data mesh framework for proof-gated, exactly-once Iceberg data product publication on AWS Lambda.
|
|
5
|
+
Project-URL: Homepage, https://github.com/vaquarkhan/aws-serverless-datamesh-framework
|
|
6
|
+
Project-URL: Documentation, https://github.com/vaquarkhan/aws-serverless-datamesh-framework#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/vaquarkhan/aws-serverless-datamesh-framework
|
|
8
|
+
Project-URL: Changelog, https://github.com/vaquarkhan/aws-serverless-datamesh-framework/blob/main/CHANGELOG.md
|
|
9
|
+
Project-URL: PyPI, https://pypi.org/project/serverless-data-mesh/
|
|
10
|
+
Author: Serverless Data Mesh Contributors
|
|
11
|
+
License-Expression: Apache-2.0
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: aws,data-as-a-product,data-mesh,domain-oriented,durable-execution,federated-data-mesh,federated-governance,iceberg,iceguard,lakehouse,lambda,pyiceberg,rule-engine,serverless,sparkrules,vaquar-pattern,veridata-recon
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Database
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
23
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.12
|
|
26
|
+
Requires-Dist: aws-durable-execution-sdk-python>=1.0.0
|
|
27
|
+
Requires-Dist: boto3>=1.34
|
|
28
|
+
Requires-Dist: iceguard>=1.0.0
|
|
29
|
+
Requires-Dist: pyiceberg[glue,rest-sigv4]>=0.7.0
|
|
30
|
+
Requires-Dist: veridata-recon>=0.1.0
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'all'
|
|
33
|
+
Requires-Dist: sparkrules>=1.2.0; extra == 'all'
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
36
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
37
|
+
Requires-Dist: pre-commit>=4; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-cov>=5; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
40
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
41
|
+
Requires-Dist: twine>=5; extra == 'dev'
|
|
42
|
+
Provides-Extra: publish
|
|
43
|
+
Requires-Dist: build>=1.2; extra == 'publish'
|
|
44
|
+
Requires-Dist: twine>=5; extra == 'publish'
|
|
45
|
+
Provides-Extra: rules
|
|
46
|
+
Requires-Dist: sparkrules>=1.2.0; extra == 'rules'
|
|
47
|
+
Provides-Extra: spark
|
|
48
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'spark'
|
|
49
|
+
Requires-Dist: sparkrules>=1.2.0; extra == 'spark'
|
|
50
|
+
Description-Content-Type: text/markdown
|
|
51
|
+
|
|
52
|
+
# Serverless Data Mesh
|
|
53
|
+
|
|
54
|
+
**Federated data mesh lakehouse publication on AWS Lambda - with cryptographic proof, not just green job logs.**
|
|
55
|
+
|
|
56
|
+
An open Python framework for **domain-oriented**, **federated data mesh** teams on AWS. Implements **data as a product**, **self-serve write infrastructure**, and **federated computational governance** at the Iceberg lakehouse layer.
|
|
57
|
+
|
|
58
|
+
- **Producer** domains publish governed **data products**
|
|
59
|
+
- **Steward** notaries enforce mesh contracts with VRP proofs
|
|
60
|
+
- **Publisher** zones expose consumer-ready **Iceberg data products**
|
|
61
|
+
|
|
62
|
+
Introduces the **[Vaquar Pattern](https://github.com/vaquarkhan/aws-serverless-datamesh-framework/blob/main/docs/vaquar-pattern.md)** (PVDM): Physical → Verify → Durable → Metadata. Invariant: `commit_metadata ⟹ VRP = PASS`.
|
|
63
|
+
|
|
64
|
+
## Install
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install serverless-data-mesh
|
|
68
|
+
serverless-data-mesh demo # <60s local PVDM demo, no AWS
|
|
69
|
+
pip install "serverless-data-mesh[rules]" # + SparkRules on Lambda
|
|
70
|
+
pip install "serverless-data-mesh[spark]" # + PySpark + SparkRules
|
|
71
|
+
pip install "serverless-data-mesh[all]" # rules + spark
|
|
72
|
+
pip install "serverless-data-mesh[dev]" # pytest, ruff, pre-commit
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Works on **Windows, macOS, and Linux**. Uses veridata-recon cryptographic VRP when wheels are available and a **pure-Python fallback** otherwise (same PASS/FAIL gate).
|
|
76
|
+
|
|
77
|
+
## What it solves
|
|
78
|
+
|
|
79
|
+
| Problem | Answer |
|
|
80
|
+
|---------|--------|
|
|
81
|
+
| Silent data loss on backfill | VRP `FAIL` blocks Iceberg snapshot |
|
|
82
|
+
| "Job succeeded" is not proof | Cryptographic multiset proof per chunk |
|
|
83
|
+
| Lambda 15-minute limit | Durable Execution + Step Functions (90+ min) |
|
|
84
|
+
| Retry duplicates data | IceGuard rollback + `workload_id` checkpoints |
|
|
85
|
+
| Central ETL bottleneck | Per-domain Lambda writer + transaction boundary |
|
|
86
|
+
| Federated blast radius | Producer · Steward · Publisher accounts |
|
|
87
|
+
|
|
88
|
+
## Building blocks
|
|
89
|
+
|
|
90
|
+
| Package | Role |
|
|
91
|
+
|---------|------|
|
|
92
|
+
| [iceguard](https://pypi.org/project/iceguard/) | Physical SafeWriter, timeout rollback, S3 resume |
|
|
93
|
+
| [veridata-recon](https://pypi.org/project/veridata-recon/) | VRP proof generation and validation |
|
|
94
|
+
| [aws-durable-execution-sdk-python](https://pypi.org/project/aws-durable-execution-sdk-python/) | Cross-invocation step replay |
|
|
95
|
+
| [pyiceberg](https://pypi.org/project/pyiceberg/) | Glue Iceberg REST metadata via `GlueCatalogConnector` |
|
|
96
|
+
| [sparkrules](https://pypi.org/project/sparkrules/) | Optional DRL business rules (`[rules]` extra) |
|
|
97
|
+
|
|
98
|
+
## Quick example
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from serverless_data_mesh import (
|
|
102
|
+
IceGuardDurableCoordinator,
|
|
103
|
+
DomainTransactionBoundary,
|
|
104
|
+
VRPProofGenerator,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
boundary = DomainTransactionBoundary(
|
|
108
|
+
domain_id="orders-domain",
|
|
109
|
+
source_namespace="raw_orders",
|
|
110
|
+
target_table="orders_curated",
|
|
111
|
+
partition_spec={"dt": "2026-06-14"},
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
coordinator = IceGuardDurableCoordinator(
|
|
115
|
+
durable_context=durable_ctx,
|
|
116
|
+
lambda_context=lambda_ctx,
|
|
117
|
+
proof_generator=VRPProofGenerator.from_env(),
|
|
118
|
+
catalog_adapter=glue_adapter,
|
|
119
|
+
)
|
|
120
|
+
outcome = coordinator.run_workload(workload)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Optional extras
|
|
124
|
+
|
|
125
|
+
| Extra | Adds |
|
|
126
|
+
|-------|------|
|
|
127
|
+
| `rules` | SparkRules DRL on Lambda |
|
|
128
|
+
| `spark` | PySpark + SparkRules |
|
|
129
|
+
| `all` | `rules` + `spark` |
|
|
130
|
+
| `dev` | pytest, ruff, mypy, pre-commit |
|
|
131
|
+
| `publish` | build, twine |
|
|
132
|
+
|
|
133
|
+
## Documentation
|
|
134
|
+
|
|
135
|
+
- [GitHub README](https://github.com/vaquarkhan/aws-serverless-datamesh-framework#readme)
|
|
136
|
+
- [Vaquar Pattern](https://github.com/vaquarkhan/aws-serverless-datamesh-framework/blob/main/docs/vaquar-pattern.md)
|
|
137
|
+
- [Getting started](https://github.com/vaquarkhan/aws-serverless-datamesh-framework/blob/main/docs/getting-started.md)
|
|
138
|
+
- [PyPI guide](https://github.com/vaquarkhan/aws-serverless-datamesh-framework/blob/main/docs/pypi.md)
|
|
139
|
+
- [Full documentation](https://github.com/vaquarkhan/aws-serverless-datamesh-framework/tree/main/docs)
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
Apache-2.0
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
serverless_data_mesh/__init__.py,sha256=FZjdbKzWnsjXoLwC1Y5gvRVU2zCUYamqlQHulVyKjvM,2893
|
|
2
|
+
serverless_data_mesh/cli.py,sha256=Ui1gaaNEAcCYRGIdI1BNa4LOVmpLjAbWF4jxIDzXg6c,6821
|
|
3
|
+
serverless_data_mesh/config.py,sha256=SAIu88NV_cD6UgglCzenp8INJjvQtj5HCZBVDnBKFpg,1561
|
|
4
|
+
serverless_data_mesh/exceptions.py,sha256=PhqfjsZtKIX_Q87GfsG1T09ds3Dhv5eS9FMJoNKCsRY,717
|
|
5
|
+
serverless_data_mesh/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
serverless_data_mesh/catalog/__init__.py,sha256=CGzY2FK-biSn2mLoq2C7RhW1CTOdyvV25ht43DwuT3Y,278
|
|
7
|
+
serverless_data_mesh/catalog/glue_connector.py,sha256=qXQ8xNirQKhMsMElb4aIgCnfZJxOQwExH_-2rN59srM,762
|
|
8
|
+
serverless_data_mesh/catalog/glue_rest.py,sha256=8ZnBVpxYzSKxqppmBPHbFkdfWd6yVUxpNzgowt9aAOo,5217
|
|
9
|
+
serverless_data_mesh/dashboard/__init__.py,sha256=wAKG7m8Ttiamrji1xGOMyNMc-rpCTuPudnlfuL5sab4,139
|
|
10
|
+
serverless_data_mesh/dashboard/cloudwatch.py,sha256=Cozj4u0SdB1MLvoMmhsz3O7CAwL3DBJLeX8KMJULZ3E,2675
|
|
11
|
+
serverless_data_mesh/dashboard/trust.py,sha256=jCZ-AIWFElZgFYsZfPEe3tNKf1MlbrP9y43U2MdrsP4,5582
|
|
12
|
+
serverless_data_mesh/governance/__init__.py,sha256=fpue3dnoX3ejHxPBJ4uKbJM1nsgPfJkF8SvYt-ZyDEM,303
|
|
13
|
+
serverless_data_mesh/governance/consumer_sla.py,sha256=_ToDdWvxqtdhC8hrr8Yt79TrSi2MWIPGYcofnT9CT9M,3475
|
|
14
|
+
serverless_data_mesh/lineage/__init__.py,sha256=QveXUN3hAvcPnBwCp6vuzi-VWgF6q3nzIUaLEq2io10,178
|
|
15
|
+
serverless_data_mesh/lineage/openlineage.py,sha256=y9704cnVWt5-pGCyGblwfy04omuvKRgTZrrtW0PV1ak,3125
|
|
16
|
+
serverless_data_mesh/local/__init__.py,sha256=m9IQo8ChzXnK2sePE90Q3NW_zSTwgatNmE-Fp4GnfK0,197
|
|
17
|
+
serverless_data_mesh/local/runtime.py,sha256=5NgpeqYcPYLNdKja_Pl96DbOPqE65pTjW8hFdH1izJY,14447
|
|
18
|
+
serverless_data_mesh/metrics/__init__.py,sha256=eiExi2A4Q40FtHxd6flaeJ4p83CkVeHjVtA8Kv9-5O8,159
|
|
19
|
+
serverless_data_mesh/metrics/mesh_trust.py,sha256=drWNVxHumYz-1uDS_uo5_x5bLe-vHWjvWXS5xXSFJRc,1530
|
|
20
|
+
serverless_data_mesh/orchestration/__init__.py,sha256=igvZy4X8EXMS8fpAvYdnF1CCQuM68oFAfpxJHZqmlMc,900
|
|
21
|
+
serverless_data_mesh/orchestration/canary.py,sha256=yGosZWu8GxVVPktw7FVChgMUA-4-4-FZLUTIWgh7AxY,4209
|
|
22
|
+
serverless_data_mesh/orchestration/coordinator.py,sha256=hi29bUP7DEsV4K9O6v30PgbDc4F_LRgB2XdwwJmuGpI,11237
|
|
23
|
+
serverless_data_mesh/orchestration/durable_steps.py,sha256=MKKYpeo_t-sHySVFf7ciLPKY6-9xRbGpP1wdRpmrGl4,2496
|
|
24
|
+
serverless_data_mesh/orchestration/reprocess.py,sha256=ISJ4zwNGoBEPvR-cRiiRnxbC3SMz96U6SmxTuLydpaA,4599
|
|
25
|
+
serverless_data_mesh/orchestration/state.py,sha256=SJa6KtaqV5Ik3RoP3tSyBQcmrmIP_mbOyfJR-D81uGQ,414
|
|
26
|
+
serverless_data_mesh/rules/__init__.py,sha256=uBYdsx7FOTT9lbLjE4Gsdr2BBtFUiqAT15lhtg3kOn8,222
|
|
27
|
+
serverless_data_mesh/rules/sparkrules_connector.py,sha256=AtRZBHvKPYW-1gwYZdy7zFfzNVbTh0wgUzWztiyq7cs,6858
|
|
28
|
+
serverless_data_mesh/scaffold/__init__.py,sha256=lMgJiPbQmk1286pUXtxbDsVe9CI2gvCSX7-4VaY2L48,152
|
|
29
|
+
serverless_data_mesh/scaffold/init_domain.py,sha256=Pg3mXGr8sc4ZcXhqeoMbhGgB9DXvOBfs4aCyS4QpG1w,5940
|
|
30
|
+
serverless_data_mesh/types/__init__.py,sha256=Fjr6s4FbgdXy6I28hGc5z7OY9SzzU2wB2efsMpGCzqU,441
|
|
31
|
+
serverless_data_mesh/types/workload.py,sha256=jBr3UAlcKWOdC4KsKL_Pn5jtyuDTRJ1SU2GJEgfSjFQ,3986
|
|
32
|
+
serverless_data_mesh/verification/__init__.py,sha256=wywzfPwErYPSv41I98B4E01k3aRllhOOhuxP9rp96wE,619
|
|
33
|
+
serverless_data_mesh/verification/backend.py,sha256=rmvpXNIzG5fUR2Tm1dF74Vmp2w5viIf4TNBVi09t5Yc,1175
|
|
34
|
+
serverless_data_mesh/verification/fallback.py,sha256=f3Gh32E8D-F6GxuF8bpjb943_u0o-AHXuDoVNwXZLLI,6489
|
|
35
|
+
serverless_data_mesh/verification/vrp.py,sha256=puxMNcST3zssZ61KPQ7pS0hWDNHIH9HrHeebSgKubQU,6660
|
|
36
|
+
serverless_data_mesh-0.2.0.dist-info/METADATA,sha256=g8qYTlRJ1KFM2nwpy_sAZSZOTZvMkrVfe_3Tf_IB-e8,6312
|
|
37
|
+
serverless_data_mesh-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
38
|
+
serverless_data_mesh-0.2.0.dist-info/entry_points.txt,sha256=1_fRcmiQPvr5xbO3aPnzHUyPCPsdsr6ICRs2NdwsG5E,71
|
|
39
|
+
serverless_data_mesh-0.2.0.dist-info/licenses/LICENSE,sha256=aHcoSYkPQgkZb342Axbb4Gj0EMM7A_1wriqtBBC0MgQ,762
|
|
40
|
+
serverless_data_mesh-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Copyright 2026 Serverless Data Mesh Contributors
|
|
6
|
+
|
|
7
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
you may not use this file except in compliance with the License.
|
|
9
|
+
You may obtain a copy of the License at
|
|
10
|
+
|
|
11
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
|
|
13
|
+
Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
See the License for the specific language governing permissions and
|
|
17
|
+
limitations under the License.
|