dc43-service-backends 0.0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dc43_service_backends-0.0.3.0/PKG-INFO +20 -0
- dc43_service_backends-0.0.3.0/README.md +5 -0
- dc43_service_backends-0.0.3.0/pyproject.toml +32 -0
- dc43_service_backends-0.0.3.0/setup.cfg +4 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/__init__.py +20 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/__init__.py +11 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/__init__.py +7 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/drafting.py +541 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/interface.py +41 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/local.py +50 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/stores/__init__.py +27 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/stores/collibra.py +389 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/stores/delta.py +151 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/stores/filesystem.py +76 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/backend/stores/interface.py +42 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/contracts/drafting.py +19 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/data_quality/__init__.py +19 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/data_quality/backend/__init__.py +18 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/data_quality/backend/engine.py +315 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/data_quality/backend/interface.py +29 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/data_quality/backend/local.py +33 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/data_quality/backend/manager.py +64 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/data_quality/backend/predicates.py +90 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/governance/__init__.py +5 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/governance/backend/__init__.py +6 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/governance/backend/interface.py +110 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/governance/backend/local.py +464 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/governance/backend/stubs/__init__.py +5 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends/governance/backend/stubs/filesystem.py +393 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends.egg-info/PKG-INFO +20 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends.egg-info/SOURCES.txt +35 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends.egg-info/dependency_links.txt +1 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends.egg-info/requires.txt +5 -0
- dc43_service_backends-0.0.3.0/src/dc43_service_backends.egg-info/top_level.txt +1 -0
- dc43_service_backends-0.0.3.0/tests/test_collibra_integration.py +134 -0
- dc43_service_backends-0.0.3.0/tests/test_contract_drafter.py +218 -0
- dc43_service_backends-0.0.3.0/tests/test_local_contract_backend.py +61 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dc43-service-backends
|
|
3
|
+
Version: 0.0.3.0
|
|
4
|
+
Summary: Backend service implementations for dc43
|
|
5
|
+
Author: Andy Petrella
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: dc43-service-clients>=0.0.3
|
|
12
|
+
Requires-Dist: open-data-contract-standard==3.0.2
|
|
13
|
+
Provides-Extra: spark
|
|
14
|
+
Requires-Dist: pyspark>=3.4; extra == "spark"
|
|
15
|
+
|
|
16
|
+
# dc43-service-backends
|
|
17
|
+
|
|
18
|
+
Backend-facing components that fulfill the dc43 service contracts live in this package.
|
|
19
|
+
Install it alongside `dc43-service-clients` when wiring custom storage, governance, or
|
|
20
|
+
quality enforcement backends.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dc43-service-backends"
|
|
7
|
+
version = "0.0.3.0"
|
|
8
|
+
description = "Backend service implementations for dc43"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Andy Petrella" }
|
|
13
|
+
]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"dc43-service-clients>=0.0.3",
|
|
21
|
+
"open-data-contract-standard==3.0.2",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
spark = ["pyspark>=3.4"]
|
|
26
|
+
|
|
27
|
+
[tool.setuptools]
|
|
28
|
+
package-dir = {"" = "src"}
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
where = ["src"]
|
|
32
|
+
include = ["dc43_service_backends*"]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Service backend implementations for dc43."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib import import_module
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
__all__ = ["contracts", "data_quality", "governance"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def __getattr__(name: str) -> Any:
    """Resolve a public subpackage lazily on first attribute access.

    Names listed in ``__all__`` are imported relative to this package the
    first time they are requested and then stored in the module globals, so
    later lookups find them directly.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The imported submodule.

    Raises:
        AttributeError: If ``name`` is not one of the names in ``__all__``.
    """

    if name not in __all__:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    loaded = import_module(f".{name}", __name__)
    # Cache the module so this hook is not consulted again for the same name.
    globals()[name] = loaded
    return loaded
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def __dir__() -> list[str]:
    """Return the sorted attribute names of the package.

    The result merges the current module globals with the names in
    ``__all__`` so lazily loaded subpackages are listed even before they
    have been imported.
    """

    names = set(globals())
    names.update(__all__)
    return sorted(names)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Service backend implementations for contract management."""
|
|
2
|
+
|
|
3
|
+
from .backend import ContractServiceBackend, LocalContractServiceBackend, ContractStore
|
|
4
|
+
from .backend import drafting
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"ContractServiceBackend",
|
|
8
|
+
"LocalContractServiceBackend",
|
|
9
|
+
"ContractStore",
|
|
10
|
+
"drafting",
|
|
11
|
+
]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Backend contracts and stubs for contract management services."""
|
|
2
|
+
|
|
3
|
+
from .interface import ContractServiceBackend
|
|
4
|
+
from .local import LocalContractServiceBackend
|
|
5
|
+
from .stores.interface import ContractStore
|
|
6
|
+
|
|
7
|
+
__all__ = ["ContractServiceBackend", "LocalContractServiceBackend", "ContractStore"]
|
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
"""Helpers to generate ODCS drafts from runtime observations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
|
|
8
|
+
from uuid import uuid4
|
|
9
|
+
|
|
10
|
+
from open_data_contract_standard.model import ( # type: ignore
|
|
11
|
+
CustomProperty,
|
|
12
|
+
DataQuality,
|
|
13
|
+
OpenDataContractStandard,
|
|
14
|
+
SchemaProperty,
|
|
15
|
+
Server,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from dc43.odcs import as_odcs_dict, contract_identity, normalise_custom_properties, to_model
|
|
19
|
+
from dc43_service_clients.data_quality import ValidationResult
|
|
20
|
+
from dc43.versioning import SemVer
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
_INVALID_IDENTIFIER = re.compile(r"[^0-9A-Za-z-]+")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _normalise_identifier(value: str | None) -> Optional[str]:
|
|
27
|
+
"""Return a semver-friendly identifier derived from ``value``."""
|
|
28
|
+
|
|
29
|
+
if value is None:
|
|
30
|
+
return None
|
|
31
|
+
token = _INVALID_IDENTIFIER.sub("-", str(value)).strip("-")
|
|
32
|
+
return token or None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _pipeline_hint(context: Mapping[str, Any] | None) -> Optional[str]:
    """Derive a short label for the draft origin from ``context``.

    The keys ``pipeline``, ``job``, ``project``, ``module``, ``function``,
    ``qualname`` and ``source`` are inspected in that order. The first truthy
    value that still yields a non-empty token after normalisation wins.

    Args:
        context: Optional mapping describing where the draft comes from.

    Returns:
        The normalised token, or ``None`` when the context is empty or no
        key produces a usable token.
    """

    if not context:
        return None

    lookup_order = (
        "pipeline",
        "job",
        "project",
        "module",
        "function",
        "qualname",
        "source",
    )
    for entry in lookup_order:
        raw = context.get(entry)
        if not raw:
            continue
        label = _normalise_identifier(str(raw))
        if label:
            return label
    return None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _draft_version_suffix(
    *,
    dataset_id: Optional[str],
    dataset_version: Optional[str],
    draft_context: Optional[Mapping[str, Any]],
) -> str:
    """Build the hyphen-joined pre-release suffix for a draft version.

    The suffix is made of, in order: the literal ``draft``, the normalised
    dataset version, the normalised dataset id, the pipeline hint taken from
    ``draft_context``, a UTC timestamp with microseconds, and eight
    hexadecimal characters from a random UUID. Parts that normalise to
    nothing are left out.

    Args:
        dataset_id: Identifier of the dataset the draft relates to.
        dataset_version: Version of that dataset.
        draft_context: Optional mapping used to derive the pipeline hint.

    Returns:
        The suffix string, always starting with ``draft``.
    """

    parts: List[str] = ["draft"]

    # Dataset version comes before dataset id, as in the resulting suffix.
    parts.extend(
        token
        for token in map(_normalise_identifier, (dataset_version, dataset_id))
        if token
    )

    hint = _pipeline_hint(draft_context)
    if hint:
        parts.append(hint)

    # Timestamp plus random entropy keep repeated drafts distinct.
    parts.append(datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S%fZ"))
    parts.append(uuid4().hex[:8])

    return "-".join(parts)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _resolve_observed_type(
|
|
79
|
+
info: Mapping[str, Any] | None,
|
|
80
|
+
fallback: str | None,
|
|
81
|
+
) -> Tuple[str, Optional[bool]]:
|
|
82
|
+
"""Return the preferred ODCS physical type and nullable flag."""
|
|
83
|
+
|
|
84
|
+
observed_type = str(
|
|
85
|
+
(info or {}).get("odcs_type")
|
|
86
|
+
or (info or {}).get("type")
|
|
87
|
+
or (info or {}).get("backend_type")
|
|
88
|
+
or fallback
|
|
89
|
+
or "string"
|
|
90
|
+
)
|
|
91
|
+
nullable = None
|
|
92
|
+
if info is not None and "nullable" in info:
|
|
93
|
+
nullable = bool(info.get("nullable", False))
|
|
94
|
+
return observed_type, nullable
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _quality_rule_key(field: SchemaProperty, dq: DataQuality) -> Optional[Tuple[str, str]]:
    """Classify a data-quality rule attached to ``field``.

    Threshold attributes are checked first, in the order greater-than,
    greater-or-equal, less-than, less-or-equal. Otherwise the lower-cased
    ``dq.rule`` is matched against ``unique``, ``enum`` (requires an iterable
    ``mustBe``) and ``regex`` (requires a truthy ``mustBe``).

    Args:
        field: Schema property owning the rule; it must have a name.
        dq: Data-quality rule to classify.

    Returns:
        A ``(prefix, label)`` pair, or ``None`` when the field has no name or
        the rule is not one of the recognised kinds.
    """

    if not (field.name or ""):
        return None

    # Each threshold attribute maps to the short prefix used in metric keys.
    thresholds = (
        ("gt", "mustBeGreaterThan"),
        ("ge", "mustBeGreaterOrEqualTo"),
        ("lt", "mustBeLessThan"),
        ("le", "mustBeLessOrEqualTo"),
    )
    for prefix, attribute in thresholds:
        bound = getattr(dq, attribute)
        if bound is not None:
            return prefix, f"{attribute} {bound}"

    rule_name = (dq.rule or "").lower()
    if rule_name == "unique":
        recognised = True
    elif rule_name == "enum":
        recognised = isinstance(dq.mustBe, Iterable)
    elif rule_name == "regex":
        recognised = bool(dq.mustBe)
    else:
        recognised = False

    return (rule_name, rule_name) if recognised else None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _quality_metric_value(
    *,
    metrics: Mapping[str, Any],
    rule_prefix: str,
    field_name: str,
) -> Optional[float]:
    """Read the violation count recorded for a rule on a field.

    The metric is looked up under ``violations.<rule_prefix>_<field_name>``.

    Args:
        metrics: Metrics mapping to read from.
        rule_prefix: Short rule prefix, for example ``gt`` or ``unique``.
        field_name: Name of the field the rule applies to.

    Returns:
        The metric as a float when it is an ``int`` or ``float``; ``None``
        when it is missing or of any other type.
    """

    recorded = metrics.get(f"violations.{rule_prefix}_{field_name}")
    return float(recorded) if isinstance(recorded, (int, float)) else None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _extract_values(candidate: Any) -> List[Any]:
    """Flatten an observation payload into a list of values.

    * ``None`` gives an empty list.
    * A mapping contributes the contents of its ``new``, ``new_values``,
      ``unexpected``, ``unexpected_values``, ``values`` and ``items`` keys,
      in that order; list/tuple/set entries are expanded, other non-``None``
      entries are added as single values.
    * A list, tuple or set is copied into a list.
    * Anything else is wrapped in a one-element list.
    """

    if candidate is None:
        return []

    containers = (list, tuple, set)

    if isinstance(candidate, Mapping):
        collected: List[Any] = []
        for key in ("new", "new_values", "unexpected", "unexpected_values", "values", "items"):
            payload = candidate.get(key)
            if payload is None:
                continue
            if isinstance(payload, containers):
                collected.extend(payload)
            else:
                collected.append(payload)
        return collected

    return list(candidate) if isinstance(candidate, containers) else [candidate]
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _enum_extension(
    *,
    dq: DataQuality,
    metrics: Mapping[str, Any],
    field_name: str,
) -> Optional[Tuple[List[Any], List[Any]]]:
    """Work out which observed values should be added to an enum rule.

    Observed values are read from ``metrics["observed.enum_<field_name>"]``
    and ``metrics["observed.enum"]``. When a source is a mapping containing
    ``field_name``, only that entry is used; otherwise the whole source is
    flattened. Values are compared with the existing ``dq.mustBe`` entries by
    their ``str()`` form, and duplicates are added only once.

    Args:
        dq: Enum data-quality rule whose ``mustBe`` holds the allowed values.
        metrics: Metrics mapping carrying the observed values.
        field_name: Name of the field the rule applies to.

    Returns:
        ``(updated_values, additions)`` when new values were observed, else
        ``None`` (also when ``field_name`` is empty or ``mustBe`` is not a
        list, tuple or set).
    """

    if not field_name:
        return None
    if not isinstance(dq.mustBe, (list, tuple, set)):
        return None
    allowed: List[Any] = list(dq.mustBe)

    sources = (
        metrics.get(f"observed.enum_{field_name}"),
        metrics.get("observed.enum", {}),
    )
    observed: List[Any] = []
    for source in sources:
        keyed = isinstance(source, Mapping) and field_name in source
        payload = source.get(field_name) if keyed else source
        observed.extend(_extract_values(payload))

    if not observed:
        return None

    # Compare on the string form so e.g. 1 and "1" count as the same value.
    known = {str(item) for item in allowed}
    new_values: List[Any] = []
    for item in observed:
        marker = str(item)
        if marker in known:
            continue
        known.add(marker)
        new_values.append(item)

    if not new_values:
        return None

    return allowed + new_values, new_values
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def draft_from_validation_result(
    *,
    validation: ValidationResult,
    base_contract: OpenDataContractStandard,
    bump: str = "minor",
    dataset_id: Optional[str] = None,
    dataset_version: Optional[str] = None,
    data_format: Optional[str] = None,
    dq_status: Optional[str] = None,
    dq_feedback: Optional[Mapping[str, Any]] = None,
    draft_context: Optional[Mapping[str, Any]] = None,
) -> Optional[OpenDataContractStandard]:
    """Build a draft contract from the outcome of a validation run.

    The base contract is copied, its version is bumped and given a unique
    ``draft`` pre-release suffix, its schema is adjusted from the observed
    schema and metrics, and custom properties recording the origin of the
    draft are appended.

    Args:
        validation: Validation outcome providing errors, warnings, metrics
            and the observed schema.
        base_contract: Contract the draft is derived from; it is not modified.
        bump: Version component to bump, passed to ``SemVer.bump``.
        dataset_id: Identifier of the validated dataset, if known.
        dataset_version: Version of the validated dataset, if known.
        data_format: Data format to record on the draft, if known.
        dq_status: Data-quality status stored in the ``dq_feedback`` property
            unless ``dq_feedback`` already has a ``status`` key.
        dq_feedback: Extra data-quality feedback to record.
        draft_context: Free-form description of where the draft originates.

    Returns:
        The draft contract, or ``None`` when the validation has neither
        errors nor warnings.
    """

    metrics = validation.metrics or {}
    schema = validation.schema or {}

    # Nothing to propose when the validation is clean.
    if not any((validation.errors, validation.warnings)):
        return None

    contract_id, version = contract_identity(base_contract)
    bumped = SemVer.parse(version).bump(bump)

    draft = (
        base_contract.model_copy(deep=True)  # type: ignore[attr-defined]
        if hasattr(base_contract, "model_copy")
        else to_model(as_odcs_dict(base_contract))
    )
    draft.status = "draft"
    suffix = _draft_version_suffix(
        dataset_id=dataset_id,
        dataset_version=dataset_version,
        draft_context=draft_context,
    )
    draft.version = f"{bumped}-{suffix}"

    # Caller-supplied context wins over the dataset arguments.
    context_payload: Dict[str, Any] = dict(draft_context or {})
    if dataset_id:
        context_payload.setdefault("dataset_id", dataset_id)
    if dataset_version:
        context_payload.setdefault("dataset_version", dataset_version)

    pipeline_token = _pipeline_hint(draft_context)
    pipeline_value: Optional[str] = None
    if draft_context:
        origin_keys = ("pipeline", "job", "project", "module", "function", "qualname", "source")
        pipeline_value = next(
            (str(raw) for raw in map(draft_context.get, origin_keys) if raw),
            None,
        )

    change_log = _apply_schema_feedback(
        draft,
        schema=schema,
        metrics=metrics,
        change_log=[],
    )
    for status, messages in (
        ("error", validation.errors),
        ("warning", validation.warnings),
    ):
        if messages:
            change_log.append(
                {
                    "status": status,
                    "kind": "validation",
                    "messages": list(messages),
                }
            )

    properties = list(normalise_custom_properties(getattr(draft, "customProperties", None)))

    def _record(name: str, value: Any) -> None:
        """Append one custom property to the draft's property list."""
        properties.append(CustomProperty(property=name, value=value))

    if dq_status or dq_feedback:
        feedback = dict(dq_feedback or {})
        if dq_status:
            feedback.setdefault("status", dq_status)
        _record("dq_feedback", feedback)

    _record("validation_metrics", {"metrics": metrics, "schema": schema})

    if data_format:
        _record("data_format", data_format)

    _record("base_contract", {"id": contract_id, "version": version})
    _record(
        "validation_outcome",
        {"errors": validation.errors, "warnings": validation.warnings},
    )

    if context_payload:
        if pipeline_value:
            # Everything before the last dot is used as the module hint.
            context_payload.setdefault("module", pipeline_value.rsplit(".", 1)[0])
        _record("draft_context", context_payload)

    # Prefer the raw origin value; fall back to the normalised token.
    pipeline_label = pipeline_value or pipeline_token
    if pipeline_label:
        _record("draft_pipeline", pipeline_label)

    provenance: Dict[str, Any] = {
        key: value
        for key, value in (
            ("dataset_version", dataset_version),
            ("dataset_id", dataset_id),
        )
        if value
    }
    if provenance:
        _record("provenance", provenance)

    if dataset_id or dataset_version:
        _record(
            "validation_reference",
            {
                "dataset_id": dataset_id,
                "dataset_version": dataset_version,
                "collected_at": datetime.now(timezone.utc).isoformat(),
            },
        )

    _record("draft_change_log", change_log)

    draft.customProperties = properties
    return draft
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def draft_from_observations(
    *,
    observations: Mapping[str, Mapping[str, Any]] | None,
    base_contract: OpenDataContractStandard,
    dataset_id: Optional[str] = None,
    dataset_version: Optional[str] = None,
    draft_context: Optional[Mapping[str, Any]] = None,
) -> OpenDataContractStandard:
    """Build a draft contract from observed schema information alone.

    The base contract is copied, its version receives a patch bump plus a
    unique ``draft`` pre-release suffix, its schema is adjusted from
    ``observations`` (no metrics are used), and custom properties recording
    the origin of the draft are appended.

    Args:
        observations: Observed field metadata keyed by field name.
        base_contract: Contract the draft is derived from; it is not modified.
        dataset_id: Identifier of the observed dataset, if known.
        dataset_version: Version of the observed dataset, if known.
        draft_context: Free-form description of where the draft originates.

    Returns:
        The draft contract.
    """

    draft = (
        base_contract.model_copy(deep=True)  # type: ignore[attr-defined]
        if hasattr(base_contract, "model_copy")
        else to_model(as_odcs_dict(base_contract))
    )
    contract_id, version = contract_identity(base_contract)
    bumped = SemVer.parse(version).bump("patch")

    suffix = _draft_version_suffix(
        dataset_id=dataset_id,
        dataset_version=dataset_version,
        draft_context=draft_context,
    )
    draft.version = f"{bumped}-{suffix}"
    draft.status = "draft"

    # Caller-supplied context wins over the dataset arguments.
    context_payload: Dict[str, Any] = dict(draft_context or {})
    if dataset_id:
        context_payload.setdefault("dataset_id", dataset_id)
    if dataset_version:
        context_payload.setdefault("dataset_version", dataset_version)

    pipeline_token = _pipeline_hint(draft_context)
    pipeline_value: Optional[str] = None
    if draft_context:
        origin_keys = ("pipeline", "job", "project", "module", "function", "qualname", "source")
        pipeline_value = next(
            (str(raw) for raw in map(draft_context.get, origin_keys) if raw),
            None,
        )

    change_log = _apply_schema_feedback(
        draft,
        schema=observations or {},
        metrics={},
        change_log=[],
    )

    properties = list(normalise_custom_properties(getattr(draft, "customProperties", None)))

    def _record(name: str, value: Any) -> None:
        """Append one custom property to the draft's property list."""
        properties.append(CustomProperty(property=name, value=value))

    _record("base_contract", {"id": contract_id, "version": version})
    _record("observed_schema", observations or {})

    if context_payload:
        if pipeline_value:
            # Everything before the last dot is used as the module hint.
            context_payload.setdefault("module", pipeline_value.rsplit(".", 1)[0])
        _record("draft_context", context_payload)

    # Prefer the raw origin value; fall back to the normalised token.
    pipeline_label = pipeline_value or pipeline_token
    if pipeline_label:
        _record("draft_pipeline", pipeline_label)

    _record("draft_change_log", change_log)

    draft.customProperties = properties
    return draft
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _apply_schema_feedback(
    draft: OpenDataContractStandard,
    *,
    schema: Mapping[str, Mapping[str, Any]],
    metrics: Mapping[str, Any],
    change_log: Optional[List[Dict[str, Any]]] = None,
) -> List[Dict[str, Any]]:
    """Adjust the fields of ``draft`` in place from observed metadata.

    For every named field the physical type is set from the observation (or
    the field's current type), ``required`` follows the observed nullable
    flag, and the observed metadata is appended to the description. Quality
    rules are then reviewed: enum rules are extended with newly observed
    values, rules with a positive violation count are dropped, and the
    remaining recognised rules are kept and annotated with the observed
    value. Unrecognised rules are kept untouched.

    Args:
        draft: Contract to modify.
        schema: Observed field metadata keyed by field name.
        metrics: Metrics mapping carrying violation counts and enum values.
        change_log: Optional list to append entries to; a new list is used
            when omitted.

    Returns:
        The change log (the same list object as ``change_log`` when given).
    """

    entries: List[Dict[str, Any]] = [] if change_log is None else change_log

    for schema_object in draft.schema_ or []:
        for column in schema_object.properties or []:
            column_name = column.name
            if not column_name:
                continue

            seen = schema.get(column_name) or {}
            physical, nullable = _resolve_observed_type(
                seen,
                column.physicalType or column.logicalType,
            )
            if physical:
                column.physicalType = physical

            required_before = bool(column.required)
            if nullable is not None:
                column.required = not nullable
            if required_before and not column.required:
                entries.append(
                    {
                        "field": column_name,
                        "status": "relaxed",
                        "constraint": "required",
                    }
                )

            if seen:
                existing = column.description or ""
                column.description = f"{existing}\nObserved metadata: {seen}".strip()

            retained: List[DataQuality] = []
            for rule in list(column.quality or []):
                classified = _quality_rule_key(column, rule)
                if not classified:
                    # Unrecognised rules pass through without a log entry.
                    retained.append(rule)
                    continue

                prefix, label = classified
                violations = _quality_metric_value(
                    metrics=metrics,
                    rule_prefix=prefix,
                    field_name=column_name,
                )

                if prefix == "enum":
                    extension = _enum_extension(dq=rule, metrics=metrics, field_name=column_name)
                    if extension:
                        widened, additions = extension
                        rule.mustBe = widened
                        entries.append(
                            {
                                "field": column_name,
                                "rule": "enum",
                                "status": "updated",
                                "details": {"added_values": additions},
                            }
                        )
                    else:
                        entries.append(
                            {
                                "field": column_name,
                                "rule": "enum",
                                "status": "kept",
                            }
                        )
                    retained.append(rule)
                elif violations and violations > 0:
                    # Violated rules are left out of the draft.
                    entries.append(
                        {
                            "field": column_name,
                            "rule": label,
                            "status": "removed",
                            "details": {"violations": violations},
                        }
                    )
                else:
                    entries.append(
                        {
                            "field": column_name,
                            "rule": label,
                            "status": "kept",
                        }
                    )
                    note = rule.description or ""
                    rule.description = f"{note}\nObserved {label}: {violations}".strip()
                    retained.append(rule)

            column.quality = retained or None

    return entries
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
__all__ = [
|
|
539
|
+
"draft_from_observations",
|
|
540
|
+
"draft_from_validation_result",
|
|
541
|
+
]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Interfaces for implementing contract management backends."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Protocol, Sequence
|
|
6
|
+
|
|
7
|
+
from open_data_contract_standard.model import OpenDataContractStandard # type: ignore
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ContractServiceBackend(Protocol):
    """Structural interface of a contract management service runtime.

    Any object providing these methods with compatible signatures satisfies
    the protocol. The method bodies here are placeholders; the exact
    behaviour (for example on unknown identifiers) is up to implementations.
    """

    def get(self, contract_id: str, contract_version: str) -> OpenDataContractStandard:
        """Return the contract stored for ``contract_id`` at ``contract_version``."""

    def latest(self, contract_id: str) -> Optional[OpenDataContractStandard]:
        """Return the latest contract for ``contract_id``, or ``None``."""

    def list_versions(self, contract_id: str) -> Sequence[str]:
        """Return the versions available for ``contract_id``."""

    def link_dataset_contract(
        self,
        *,
        dataset_id: str,
        dataset_version: str,
        contract_id: str,
        contract_version: str,
    ) -> None:
        """Associate a dataset version with a contract version."""

    def get_linked_contract_version(
        self,
        *,
        dataset_id: str,
        dataset_version: Optional[str] = None,
    ) -> Optional[str]:
        """Return the contract version linked to the dataset, or ``None``.

        ``dataset_version`` is optional; how an omitted version is resolved
        is presumably implementation-specific.
        """
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
__all__ = ["ContractServiceBackend"]
|