dagster-datacontract 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
1
+ import json
2
+ import textwrap
3
+ from datetime import timedelta
4
+ from typing import Any
5
+
6
+ import dagster as dg
7
+ from dagster import TableColumnLineage, TableSchema
8
+ from datacontract.data_contract import DataContract
9
+ from datacontract.model.run import ResultEnum
10
+
11
+
12
class DataContractLoader:
    """Expose the contents of a data contract as Dagster asset inputs.

    On construction the loader reads the data contract specification once and
    caches the derived values as attributes (``metadata``, ``tags``,
    ``description``, ``owner``, ``version``, ``cron_schedule``) so they can be
    passed straight to ``dg.asset`` and related Dagster builders.

    Args:
        asset_name: Name of the Dagster asset. It is also used as the key to
            look up the matching model in the data contract specification.
        data_contract: The data contract to read from and to run tests with.

    Raises:
        AttributeError: If the specification has no model named
            ``asset_name`` (the model lookup yields ``None`` and its fields
            cannot be read).
    """

    def __init__(
        self,
        asset_name: str,
        data_contract: DataContract,
    ):
        self.asset_name = asset_name
        self.asset_key = dg.AssetKey(path=self.asset_name)
        self.data_contract = data_contract
        self.data_contract_specification = (
            self.data_contract.get_data_contract_specification()
        )
        self.metadata = self._load_metadata()
        self.tags = self._load_tags()
        self.description = self._load_description()
        self.owner = self._load_owner()
        self.version = self._load_version()
        self.cron_schedule = self._load_cron_schedule()

    def _load_metadata(
        self,
    ) -> dict[str, TableColumnLineage | TableSchema | Any] | None:
        """Build the column schema and column lineage metadata for the asset.

        Returns:
            A mapping with the ``dagster/column_schema`` and
            ``dagster/column_lineage`` entries. Every column of the model gets
            a lineage entry; columns without lineage map to an empty list.
        """
        fields = self.data_contract_specification.models.get(self.asset_name).fields

        columns = []
        deps_by_column = {}

        for column_name, column_field in fields.items():
            columns.append(
                dg.TableColumn(
                    name=column_name,
                    type=column_field.type,
                    description=column_field.description,
                )
            )

            # Round-trip through JSON to get a plain dict view of the field,
            # which is where the "lineage" entry is read from.
            lineage = json.loads(column_field.model_dump_json()).get("lineage")
            # A lineage block may exist without (or with a null) "inputFields";
            # treat that the same as "no upstream columns" instead of failing.
            lineage_entries = (lineage or {}).get("inputFields") or []

            deps_by_column[column_name] = [
                dg.TableColumnDep(
                    asset_key=dg.AssetKey(lineage_entry["name"]),
                    column_name=lineage_entry["field"],
                )
                for lineage_entry in lineage_entries
            ]

        return {
            "dagster/column_schema": dg.TableSchema(columns=columns),
            "dagster/column_lineage": dg.TableColumnLineage(
                deps_by_column=deps_by_column
            ),
        }

    def _load_tags(self) -> dict[str, str]:
        """Convert the contract's tag list into a key/value mapping.

        An item of the form ``"key: value"`` becomes ``{"key": "value"}``; an
        item without a colon becomes a key with an empty string value. Only
        the text between the first and second colon is used as the value.

        Returns:
            The tag mapping, empty when the contract declares no tags.
        """
        tags: dict[str, str] = {}

        for item in self.data_contract_specification.tags or []:
            parts = item.split(":")
            tags[parts[0].strip()] = parts[1].strip() if len(parts) > 1 else ""

        return tags

    @staticmethod
    def _format_description(text: str | None) -> str:
        """Return ``text`` with doubled newlines and common indent removed.

        A missing or empty description yields an empty string.
        """
        if not text:
            return ""

        return textwrap.dedent(text.replace("\n", "\n\n"))

    def _load_description(self) -> str | None:
        """Combine the model description and the contract info description.

        Each available description is formatted the same way (single newlines
        doubled, common leading whitespace removed) and the parts are joined
        with a blank line, model description first.

        Returns:
            The combined description, or ``None`` when neither is set.
        """
        specification = self.data_contract_specification
        model = specification.models.get(self.asset_name)

        # Descriptions are optional, so read them defensively rather than
        # calling string methods on a possibly missing value.
        model_description = self._format_description(
            getattr(model, "description", None)
        )
        info_description = self._format_description(
            getattr(specification.info, "description", None)
        )

        parts = [part for part in (model_description, info_description) if part]

        return "\n\n".join(parts) or None

    def _load_owner(self) -> list[str] | None:
        """Return the contract owner as a single ``team:<owner>`` entry.

        Returns:
            A one-element list, or ``None`` when no owner is set.
        """
        owner = self.data_contract_specification.info.owner

        return [f"team:{owner}"] if owner else None

    def _load_version(self) -> str | None:
        """Return the version declared in the contract's info section."""
        return self.data_contract_specification.info.version

    def _load_cron_schedule(self) -> str | None:
        """Return the cron expression from the service-level frequency.

        Returns:
            The cron string, or ``None`` when any part of the
            ``servicelevels.frequency.cron`` attribute chain is unavailable.
        """
        try:
            return self.data_contract_specification.servicelevels.frequency.cron
        except AttributeError:
            return None

    def load_data_quality_checks(self) -> dg.AssetChecksDefinition:
        """Create a blocking asset check that runs the data contract tests.

        Returns:
            An asset check bound to this loader's asset key. When executed it
            runs ``data_contract.test()`` and passes only if the run result is
            ``ResultEnum.passed``; the formatted run output is attached as the
            ``"quality check"`` metadata entry.
        """

        @dg.asset_check(
            asset=self.asset_key,
            blocking=True,
        )
        def check_asset():
            run = self.data_contract.test()

            return dg.AssetCheckResult(
                passed=run.result == ResultEnum.passed,
                metadata={
                    "quality check": run.pretty(),
                },
            )

        return check_asset

    def load_freshness_checks(self, lower_bound_delta: timedelta):
        """Build last-update freshness checks for the asset.

        Args:
            lower_bound_delta: Passed through as ``lower_bound_delta`` to
                ``dg.build_last_update_freshness_checks``.

        Returns:
            The value returned by ``dg.build_last_update_freshness_checks``
            for this asset, using the contract's cron schedule (possibly
            ``None``) as ``deadline_cron``.
        """
        freshness_checks = dg.build_last_update_freshness_checks(
            assets=[self.asset_name],
            lower_bound_delta=lower_bound_delta,
            deadline_cron=self.cron_schedule,
        )

        return freshness_checks
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: dagster-datacontract
3
+ Version: 0.1.0
4
+ Summary: Load metadata and asset check specifications from data contracts.
5
+ Author-email: Fredrik Bakken <fredrik@dataheim.io>
6
+ Requires-Python: >=3.11.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: dagster>=1.10.9
9
+ Requires-Dist: dagster-webserver>=1.10.9
10
+ Requires-Dist: datacontract-cli>=0.10.23
11
+
12
+ # dagster-datacontract
13
+
14
+ [Dagster](https://github.com/dagster-io/dagster) [Datacontract](https://github.com/datacontract/datacontract-cli) implementation for loading asset metadata and context information from data contracts to Dagster assets.
15
+
16
+ ## Installation
17
+
18
+ Install `dagster-datacontract` using [`uv`](https://github.com/astral-sh/uv):
19
+
20
+ ```shell
21
+ uv add dagster-datacontract
22
+ ```
23
+
24
+ ## Simple Example
25
+
26
+ The following example can be found inside the [example](https://github.com/dataheim-io/dagster-datacontract/tree/main/example) directory:
27
+
28
+ ```python
29
+ from datetime import timedelta
30
+ from pathlib import Path
31
+
32
+ import dagster as dg
33
+ import requests
34
+ import polars as pl
35
+ from datacontract.data_contract import DataContract
36
+
37
+ from dagster_datacontract import DataContractLoader
38
+
39
+
40
+ asset_name = "yellow_taxi_trip_records"
41
+ data_contract = DataContractLoader(
42
+ asset_name=asset_name,
43
+ data_contract=DataContract(
44
+ data_contract_file="./example/datacontract.yml",
45
+ ),
46
+ )
47
+
48
+
49
@dg.asset(
    name=asset_name,
    metadata=data_contract.metadata,
    tags=data_contract.tags,
    description=data_contract.description,
    owners=data_contract.owner,
    code_version=data_contract.version,
)
def yellow_taxi_trip_records(
    context: dg.AssetExecutionContext,
) -> None:
    """Download the January 2025 yellow taxi trip file and log its contents.

    The parquet file is fetched over HTTP, written below ``./example/data``
    (created if missing), read back with polars, and logged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    download_path = "./example/data"
    Path(download_path).mkdir(parents=True, exist_ok=True)

    url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
    # Without a timeout the run could hang forever on a stalled connection.
    response = requests.get(url=url, timeout=60)
    # Fail here on an HTTP error instead of saving the error page as parquet.
    response.raise_for_status()

    file_path = f"{download_path}/yellow_tripdata_2025-01.parquet"
    context.log.info(f"Reading data from '{url}' and writing to '{file_path}'.")
    with open(file_path, "wb") as f:
        f.write(response.content)

    df = pl.read_parquet(file_path)
    context.log.info(f"File contents downloaded:\n{df}")
73
+
74
+
75
+ asset_check_yellow_taxi_trip_records = data_contract.load_data_quality_checks()
76
+
77
+ freshness_checks = data_contract.load_freshness_checks(
78
+ lower_bound_delta=timedelta(minutes=5)
79
+ )
80
+ freshness_checks_sensor = dg.build_sensor_for_freshness_checks(
81
+ freshness_checks=freshness_checks,
82
+ default_status=dg.DefaultSensorStatus.RUNNING,
83
+ )
84
+
85
+ job = dg.define_asset_job(
86
+ name="monthly_taxi_trips",
87
+ selection=[asset_name],
88
+ )
89
+ schedule = dg.ScheduleDefinition(
90
+ job=job,
91
+ cron_schedule=data_contract.cron_schedule,
92
+ default_status=dg.DefaultScheduleStatus.RUNNING,
93
+ )
94
+
95
+
96
+ defs = dg.Definitions(
97
+ assets=[yellow_taxi_trip_records],
98
+ asset_checks=[asset_check_yellow_taxi_trip_records, *freshness_checks],
99
+ schedules=[schedule],
100
+ sensors=[freshness_checks_sensor],
101
+ )
102
+ ```
@@ -0,0 +1,5 @@
1
+ dagster_datacontract/__init__.py,sha256=AzD8f6UK7F4QmvtfUPrgpcvXPtNT-x_bT_CXTiM29OU,4459
2
+ dagster_datacontract-0.1.0.dist-info/METADATA,sha256=O8yiXq6f5G-cN1maR_mI5w1reVP3MTn_Tc3nB_dIAeg,2969
3
+ dagster_datacontract-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
4
+ dagster_datacontract-0.1.0.dist-info/top_level.txt,sha256=_HUQ6OJ50Q0VZxEkdocTtxk1QkJpztb1QY7A0rcvtCE,21
5
+ dagster_datacontract-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (78.1.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ dagster_datacontract