dagster-datacontract 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
|
|
1
|
+
import json
|
2
|
+
import textwrap
|
3
|
+
from datetime import timedelta
|
4
|
+
from typing import Any
|
5
|
+
|
6
|
+
import dagster as dg
|
7
|
+
from dagster import TableColumnLineage, TableSchema
|
8
|
+
from datacontract.data_contract import DataContract
|
9
|
+
from datacontract.model.run import ResultEnum
|
10
|
+
|
11
|
+
|
12
|
+
class DataContractLoader:
    """Derive Dagster asset arguments and asset checks from a data contract.

    The contract specification is read once in ``__init__`` and the values
    derived from it are stored as plain attributes:

    * ``asset_key`` - ``dg.AssetKey`` built from ``asset_name``.
    * ``metadata`` - column schema and column lineage of the model.
    * ``tags`` - contract tags as a ``{key: value}`` dict.
    * ``description`` - model and/or info description, or ``None``.
    * ``owner`` - ``["team:<owner>"]`` or ``None``.
    * ``version`` - the contract's info version, or ``None``.
    * ``cron_schedule`` - the service-level frequency cron, or ``None``.
    """

    def __init__(
        self,
        asset_name: str,
        data_contract: DataContract,
    ):
        """Read the specification of *data_contract* and derive all attributes.

        Args:
            asset_name: Name of the asset. It is also used as the key of the
                model looked up in the contract's ``models`` mapping.
            data_contract: Contract whose specification is loaded and whose
                ``test()`` method backs the data quality check.

        Raises:
            AttributeError: If the contract has no model named *asset_name*.
        """
        self.asset_name = asset_name
        self.asset_key = dg.AssetKey(path=self.asset_name)
        self.data_contract = data_contract
        self.data_contract_specification = (
            self.data_contract.get_data_contract_specification()
        )
        self.metadata = self._load_metadata()
        self.tags = self._load_tags()
        self.description = self._load_description()
        self.owner = self._load_owner()
        self.version = self._load_version()
        self.cron_schedule = self._load_cron_schedule()

    def _require_model(self):
        """Return the contract model named ``self.asset_name``.

        Raises:
            AttributeError: If no such model exists. The exception type is the
                one the bare attribute access raised before; only the message
                is made explicit.
        """
        model = self.data_contract_specification.models.get(self.asset_name)
        if model is None:
            raise AttributeError(
                f"Data contract has no model named '{self.asset_name}'."
            )
        return model

    def _info_value(self, name: str) -> Any:
        """Return attribute *name* of the contract's ``info``, or ``None``.

        ``None`` is returned both when ``info`` itself is missing and when it
        lacks the attribute.
        """
        return getattr(self.data_contract_specification.info, name, None)

    @staticmethod
    def _as_paragraphs(text: str | None) -> str:
        """Double every newline in *text*; a missing text becomes ``""``."""
        return text.replace("\n", "\n\n") if text else ""

    def _load_metadata(
        self,
    ) -> dict[str, TableColumnLineage | TableSchema | Any] | None:
        """Build the column schema and column lineage metadata of the model.

        Returns:
            A dict with the keys ``"dagster/column_schema"`` and
            ``"dagster/column_lineage"``. Columns without lineage input
            fields map to an empty dependency list.

        Raises:
            AttributeError: If the contract has no model named
                ``self.asset_name``.
        """
        fields = self._require_model().fields

        columns = []
        deps_by_column = {}

        for column_name, column_field in fields.items():
            columns.append(
                dg.TableColumn(
                    name=column_name,
                    type=column_field.type,
                    description=column_field.description,
                )
            )

            # Round-trip through JSON so the lineage is handled as plain dicts.
            lineage = json.loads(column_field.model_dump_json()).get("lineage")
            # A lineage entry without "inputFields" means "no dependencies",
            # not an error.
            lineage_entries = (lineage or {}).get("inputFields") or []

            deps_by_column[column_name] = [
                dg.TableColumnDep(
                    asset_key=dg.AssetKey(lineage_entry["name"]),
                    column_name=lineage_entry["field"],
                )
                for lineage_entry in lineage_entries
            ]

        return {
            "dagster/column_schema": dg.TableSchema(columns=columns),
            "dagster/column_lineage": dg.TableColumnLineage(
                deps_by_column=deps_by_column
            ),
        }

    def _load_tags(self) -> dict[str, str]:
        """Convert the contract's tag strings into a ``{key: value}`` dict.

        A tag is split on its first ``":"`` only, so a value that itself
        contains colons is kept whole. A tag without ``":"`` maps to ``""``.
        Keys and values are stripped of surrounding whitespace. A missing tag
        list yields an empty dict.
        """
        tags = {}
        for item in self.data_contract_specification.tags or []:
            key, _, value = item.partition(":")
            tags[key.strip()] = value.strip()

        return tags

    def _load_description(self) -> str | None:
        """Combine the model description and the contract info description.

        Every newline in either description is doubled. When both are
        present they are joined by a blank line, model description first.
        When only one is present it is returned after ``textwrap.dedent``.

        Returns:
            The combined description, or ``None`` when neither is set.

        Raises:
            AttributeError: If the contract has no model named
                ``self.asset_name``.
        """
        model_description = self._as_paragraphs(self._require_model().description)
        info_description = self._as_paragraphs(self._info_value("description"))

        if model_description and info_description:
            return f"{model_description}\n\n{info_description}"
        if model_description:
            return textwrap.dedent(model_description)
        if info_description:
            return textwrap.dedent(info_description)

        return None

    def _load_owner(self) -> list[str] | None:
        """Return ``["team:<owner>"]`` for the info owner, or ``None`` if unset."""
        owner = self._info_value("owner")

        return [f"team:{owner}"] if owner else None

    def _load_version(self) -> str | None:
        """Return the contract's info version, or ``None`` when there is no info."""
        return self._info_value("version")

    def _load_cron_schedule(self) -> str | None:
        """Return ``servicelevels.frequency.cron``, or ``None`` if unavailable."""
        try:
            return self.data_contract_specification.servicelevels.frequency.cron
        except AttributeError:
            # Some link of the servicelevels -> frequency -> cron chain is
            # missing; the contract simply defines no schedule.
            return None

    def load_data_quality_checks(self) -> dg.AssetChecksDefinition:
        """Create an asset check that runs the data contract's tests.

        The check is registered for ``self.asset_key`` with ``blocking=True``.
        It passes when the result of ``data_contract.test()`` equals
        ``ResultEnum.passed``; the value of ``run.pretty()`` is attached as
        the ``"quality check"`` metadata entry.
        """

        @dg.asset_check(
            asset=self.asset_key,
            blocking=True,
        )
        def check_asset():
            run = self.data_contract.test()

            return dg.AssetCheckResult(
                passed=run.result == ResultEnum.passed,
                metadata={
                    "quality check": run.pretty(),
                },
            )

        return check_asset

    def load_freshness_checks(self, lower_bound_delta: timedelta):
        """Build last-update freshness checks for this asset.

        Args:
            lower_bound_delta: Passed through unchanged to
                ``dg.build_last_update_freshness_checks``.

        Returns:
            Whatever ``dg.build_last_update_freshness_checks`` returns for
            this asset, using ``self.cron_schedule`` as ``deadline_cron``.
        """
        freshness_checks = dg.build_last_update_freshness_checks(
            assets=[self.asset_name],
            lower_bound_delta=lower_bound_delta,
            deadline_cron=self.cron_schedule,
        )

        return freshness_checks
|
@@ -0,0 +1,102 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: dagster-datacontract
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Load metadata and asset check specifications from data contracts.
|
5
|
+
Author-email: Fredrik Bakken <fredrik@dataheim.io>
|
6
|
+
Requires-Python: >=3.11.10
|
7
|
+
Description-Content-Type: text/markdown
|
8
|
+
Requires-Dist: dagster>=1.10.9
|
9
|
+
Requires-Dist: dagster-webserver>=1.10.9
|
10
|
+
Requires-Dist: datacontract-cli>=0.10.23
|
11
|
+
|
12
|
+
# dagster-datacontract
|
13
|
+
|
14
|
+
A [Dagster](https://github.com/dagster-io/dagster) integration for [Datacontract](https://github.com/datacontract/datacontract-cli) that loads asset metadata and context information from data contracts into Dagster assets.
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
Install `dagster-datacontract` using [`uv`](https://github.com/astral-sh/uv):
|
19
|
+
|
20
|
+
```shell
|
21
|
+
uv add dagster-datacontract
|
22
|
+
```
|
23
|
+
|
24
|
+
## Simple Example
|
25
|
+
|
26
|
+
The following example can be found inside the [example](https://github.com/dataheim-io/dagster-datacontract/tree/main/example) directory:
|
27
|
+
|
28
|
+
```python
|
29
|
+
from datetime import timedelta
|
30
|
+
from pathlib import Path
|
31
|
+
|
32
|
+
import dagster as dg
|
33
|
+
import requests
|
34
|
+
import polars as pl
|
35
|
+
from datacontract.data_contract import DataContract
|
36
|
+
|
37
|
+
from dagster_datacontract import DataContractLoader
|
38
|
+
|
39
|
+
|
40
|
+
asset_name = "yellow_taxi_trip_records"
|
41
|
+
data_contract = DataContractLoader(
|
42
|
+
asset_name=asset_name,
|
43
|
+
data_contract=DataContract(
|
44
|
+
data_contract_file="./example/datacontract.yml",
|
45
|
+
),
|
46
|
+
)
|
47
|
+
|
48
|
+
|
49
|
+
@dg.asset(
|
50
|
+
name=asset_name,
|
51
|
+
metadata=data_contract.metadata,
|
52
|
+
tags=data_contract.tags,
|
53
|
+
description=data_contract.description,
|
54
|
+
owners=data_contract.owner,
|
55
|
+
code_version=data_contract.version,
|
56
|
+
)
|
57
|
+
def yellow_taxi_trip_records(
|
58
|
+
context: dg.AssetExecutionContext,
|
59
|
+
) -> None:
|
60
|
+
download_path = "./example/data"
|
61
|
+
Path(download_path).mkdir(parents=True, exist_ok=True)
|
62
|
+
|
63
|
+
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
|
64
|
+
response = requests.get(url=url)
|
65
|
+
|
66
|
+
file_path = f"{download_path}/yellow_tripdata_2025-01.parquet"
|
67
|
+
context.log.info(f"Reading data from '{url}' and writing to '{file_path}'.")
|
68
|
+
with open(file_path, "wb") as f:
|
69
|
+
f.write(response.content)
|
70
|
+
|
71
|
+
df = pl.read_parquet(file_path)
|
72
|
+
context.log.info(f"File contents downloaded:\n{df}")
|
73
|
+
|
74
|
+
|
75
|
+
asset_check_yellow_taxi_trip_records = data_contract.load_data_quality_checks()
|
76
|
+
|
77
|
+
freshness_checks = data_contract.load_freshness_checks(
|
78
|
+
lower_bound_delta=timedelta(minutes=5)
|
79
|
+
)
|
80
|
+
freshness_checks_sensor = dg.build_sensor_for_freshness_checks(
|
81
|
+
freshness_checks=freshness_checks,
|
82
|
+
default_status=dg.DefaultSensorStatus.RUNNING,
|
83
|
+
)
|
84
|
+
|
85
|
+
job = dg.define_asset_job(
|
86
|
+
name="monthly_taxi_trips",
|
87
|
+
selection=[asset_name],
|
88
|
+
)
|
89
|
+
schedule = dg.ScheduleDefinition(
|
90
|
+
job=job,
|
91
|
+
cron_schedule=data_contract.cron_schedule,
|
92
|
+
default_status=dg.DefaultScheduleStatus.RUNNING,
|
93
|
+
)
|
94
|
+
|
95
|
+
|
96
|
+
defs = dg.Definitions(
|
97
|
+
assets=[yellow_taxi_trip_records],
|
98
|
+
asset_checks=[asset_check_yellow_taxi_trip_records, *freshness_checks],
|
99
|
+
schedules=[schedule],
|
100
|
+
sensors=[freshness_checks_sensor],
|
101
|
+
)
|
102
|
+
```
|
@@ -0,0 +1,5 @@
|
|
1
|
+
dagster_datacontract/__init__.py,sha256=AzD8f6UK7F4QmvtfUPrgpcvXPtNT-x_bT_CXTiM29OU,4459
|
2
|
+
dagster_datacontract-0.1.0.dist-info/METADATA,sha256=O8yiXq6f5G-cN1maR_mI5w1reVP3MTn_Tc3nB_dIAeg,2969
|
3
|
+
dagster_datacontract-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
4
|
+
dagster_datacontract-0.1.0.dist-info/top_level.txt,sha256=_HUQ6OJ50Q0VZxEkdocTtxk1QkJpztb1QY7A0rcvtCE,21
|
5
|
+
dagster_datacontract-0.1.0.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
dagster_datacontract
|