dagster-datacontract 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,3 @@
1
- import json
2
- import re
3
- import textwrap
4
1
  from datetime import timedelta
5
2
  from typing import Any
6
3
 
@@ -10,6 +7,14 @@ from datacontract.data_contract import DataContract
10
7
  from datacontract.model.run import ResultEnum
11
8
  from loguru import logger
12
9
 
10
+ from dagster_datacontract.description import get_description
11
+ from dagster_datacontract.metadata import (
12
+ get_column_lineage,
13
+ get_server_information,
14
+ get_table_column,
15
+ )
16
+ from dagster_datacontract.tags import get_tags
17
+
13
18
 
14
19
  class DataContractLoader:
15
20
  def __init__(
@@ -24,8 +29,11 @@ class DataContractLoader:
24
29
  self.data_contract.get_data_contract_specification()
25
30
  )
26
31
  self.metadata = self._load_metadata()
27
- self.tags = self._load_tags(self.data_contract_specification.tags)
28
- self.description = self.load_description()
32
+ self.tags = get_tags(self.data_contract_specification.tags)
33
+ self.description = get_description(
34
+ self.asset_name,
35
+ self.data_contract_specification,
36
+ )
29
37
  self.owner = self._load_owner()
30
38
  self.version = self._load_version()
31
39
  self.cron_schedule = self._load_cron_schedule()
@@ -33,76 +41,32 @@ class DataContractLoader:
33
41
  def _load_metadata(
34
42
  self,
35
43
  ) -> dict[str, TableColumnLineage | TableSchema | Any] | None:
36
- fields = self.data_contract_specification.models.get(self.asset_name).fields
37
-
38
44
  columns = []
39
45
  deps_by_column = {}
40
46
 
47
+ fields = self.data_contract_specification.models.get(self.asset_name).fields
48
+
41
49
  for column_name, column_field in fields.items():
42
- nullable = column_field.required if column_field.required else True
43
- unique = column_field.unique if column_field.unique else False
44
-
45
- columns.append(
46
- dg.TableColumn(
47
- name=column_name,
48
- type=column_field.type,
49
- description=column_field.description,
50
- constraints=dg.TableColumnConstraints(
51
- nullable=nullable,
52
- unique=unique,
53
- ),
54
- tags=self._load_tags(column_field.tags),
55
- )
56
- )
50
+ table_column = get_table_column(column_name, column_field)
51
+ columns.append(table_column)
57
52
 
58
- lineage = json.loads(column_field.model_dump_json()).get("lineage")
59
- if not lineage:
60
- deps_by_column[column_name] = []
61
- else:
62
- lineage_entries = lineage.get("inputFields")
53
+ table_column_lineage = get_column_lineage(column_field)
54
+ deps_by_column[column_name] = table_column_lineage
63
55
 
64
- deps_by_column[column_name] = [
65
- dg.TableColumnDep(
66
- asset_key=dg.AssetKey(lineage_entry["name"]),
67
- column_name=lineage_entry["field"],
68
- )
69
- for lineage_entry in lineage_entries
70
- ]
56
+ server_information = get_server_information(
57
+ self.data_contract_specification,
58
+ self.data_contract._server,
59
+ self.asset_name,
60
+ )
71
61
 
72
62
  return {
73
63
  "dagster/column_schema": dg.TableSchema(columns=columns),
74
64
  "dagster/column_lineage": dg.TableColumnLineage(
75
65
  deps_by_column=deps_by_column
76
66
  ),
67
+ **server_information,
77
68
  }
78
69
 
79
- @staticmethod
80
- def _load_tags(
81
- tags_list: list[str] | None,
82
- ) -> dict[str, str]:
83
- """Safely load tags from data contract.
84
-
85
- More information about Dagster tags:
86
- https://docs.dagster.io/guides/build/assets/metadata-and-tags/tags
87
- """
88
- key_pattern = re.compile(r"^[\w.-]{1,63}$")
89
- val_pattern = re.compile(r"^[\w.-]{0,63}$")
90
-
91
- tags = {}
92
-
93
- for item in tags_list:
94
- if ":" in item:
95
- key, val = map(str.strip, item.split(":", 1))
96
- else:
97
- key, val = item.strip(), ""
98
-
99
- if key_pattern.match(key) and val_pattern.match(val):
100
- tags[key] = val
101
- else:
102
- logger.warning(f"Ignoring invalid tag: {item}")
103
-
104
- return tags
105
-
106
70
  def _load_owner(self) -> list[str] | None:
107
71
  owner = self.data_contract_specification.info.owner
108
72
 
@@ -120,55 +84,9 @@ class DataContractLoader:
120
84
  )
121
85
  return cron_schedule
122
86
  except AttributeError:
87
+ logger.warning("'servicelevels.frequency.cron' not found in Data Contract.")
123
88
  return None
124
89
 
125
- def load_description(
126
- self, config: dict[str, Any] | None = None, separator: str = "\n"
127
- ) -> str | None:
128
- """Load and return a formatted description string based on the data contract specification.
129
-
130
- This method composes a description by pulling text from different parts
131
- of the data contract specification (e.g., model and info descriptions),
132
- joining them using the specified separator.
133
-
134
- Args:
135
- config (dict[str, Any] | None, optional): A configuration dictionary
136
- specifying the order in which to concatenate the description parts.
137
- Defaults to `{"order": ["model", "info"]}`.
138
- separator (str, optional): A string used to separate different parts
139
- of the description. Defaults to a newline character (`"\n"`).
140
-
141
- Returns:
142
- str | None: A single string combining the specified description parts
143
- if available, otherwise `None`.
144
-
145
-
146
- Example:
147
- >>> self.load_description()
148
- 'Model description...\nInfo description...'
149
- """
150
- default_config = {"order": ["model", "info"]}
151
-
152
- configuration = default_config | (config or {})
153
-
154
- descriptions = {
155
- "model": self.data_contract_specification.models.get(
156
- self.asset_name
157
- ).description,
158
- "info": self.data_contract_specification.info.description,
159
- }
160
-
161
- parts = []
162
- for key in configuration["order"]:
163
- desc = descriptions.get(key).replace("\n", f"{separator}\n")
164
- if desc:
165
- parts.append(textwrap.dedent(desc))
166
-
167
- if parts:
168
- return f"{separator}\n".join(parts)
169
-
170
- return None
171
-
172
90
  def load_data_quality_checks(self) -> dg.AssetChecksDefinition:
173
91
  """Define and return a data quality check for the specified asset.
174
92
 
@@ -190,7 +108,11 @@ class DataContractLoader:
190
108
  blocking=True,
191
109
  )
192
110
  def check_asset():
193
- run = self.data_contract.test()
111
+ data_contract = DataContract(
112
+ data_contract=self.data_contract_specification,
113
+ server=self.server_name,
114
+ )
115
+ run = data_contract.test()
194
116
 
195
117
  return dg.AssetCheckResult(
196
118
  passed=run.result == ResultEnum.passed,
@@ -0,0 +1,3 @@
1
+ from dagster_datacontract.description.description import get_description
2
+
3
+ __all__ = ["get_description"]
@@ -0,0 +1,48 @@
1
+ import textwrap
2
+ from typing import Any
3
+
4
+ from datacontract.data_contract import DataContractSpecification
5
+
6
+
7
def get_description(
    asset_name: str,
    data_contract_specification: DataContractSpecification,
    config: dict[str, Any] | None = None,
    separator: str = "\n",
) -> str | None:
    r"""Compose an asset description from the data contract specification.

    Text is pulled from the model entry for ``asset_name`` and from the
    contract's ``info`` section, then concatenated in the configured order.
    Parts that are missing or empty are skipped instead of raising.

    Args:
        asset_name: Key of the model inside ``data_contract_specification.models``.
        data_contract_specification: The specification to read descriptions from.
        config: Optional overrides merged over the default
            ``{"order": ["model", "info"]}``. Keys in ``order`` other than
            ``"model"`` and ``"info"`` are ignored.
        separator: Text inserted before every newline inside a part and
            between parts. Defaults to ``"\n"``, which turns single line
            breaks into blank lines.

    Returns:
        The combined description, or ``None`` when no part has any text.
    """
    configuration = {"order": ["model", "info"]} | (config or {})

    # Both the model entry and its description are optional in a contract,
    # so read them defensively rather than chaining attribute access.
    model = (data_contract_specification.models or {}).get(asset_name)
    descriptions = {
        "model": getattr(model, "description", None),
        "info": getattr(data_contract_specification.info, "description", None),
    }

    parts = []
    for key in configuration["order"]:
        text = descriptions.get(key)
        if not text:
            continue
        parts.append(textwrap.dedent(text.replace("\n", f"{separator}\n")))

    return f"{separator}\n".join(parts) if parts else None
@@ -0,0 +1,7 @@
1
+ from dagster_datacontract.metadata.server_information import get_server_information
2
+ from dagster_datacontract.metadata.table_colums import (
3
+ get_column_lineage,
4
+ get_table_column,
5
+ )
6
+
7
+ __all__ = ["get_table_column", "get_column_lineage", "get_server_information"]
@@ -0,0 +1,75 @@
1
+ import os
2
+ import urllib.parse
3
+
4
+ from datacontract.data_contract import DataContractSpecification
5
+
6
+
7
def _normalize_path(path: str) -> str:
    """Return ``path`` as a URI, turning local paths into absolute ``file://`` URIs.

    Args:
        path: A plain filesystem path (relative, absolute or ``~``-prefixed),
            a ``file:`` URI, or a URI with any other scheme.

    Returns:
        ``path`` unchanged when it carries a non-file scheme; otherwise
        ``file://`` followed by the absolute, user-expanded local path.
    """
    parsed = urllib.parse.urlparse(path)
    scheme = parsed.scheme

    # A single-letter "scheme" is a Windows drive letter (C:\...), not a URL.
    is_drive_letter = len(scheme) == 1
    if scheme and scheme != "file" and not is_drive_letter:
        return path

    # For file: URIs only the path component is a filesystem path; feeding
    # the whole URI to abspath would yield "<cwd>/file:/...".
    local_path = parsed.path if scheme == "file" else path
    full_path = os.path.abspath(os.path.expanduser(local_path))
    return f"file://{full_path}"
15
+
16
+
17
+ def get_server_information(
18
+ data_contract_specification: DataContractSpecification,
19
+ server_name: str | None,
20
+ asset_name: str,
21
+ ) -> dict[str, str]:
22
+ server = data_contract_specification.servers.get(server_name)
23
+ if not server:
24
+ return {}
25
+
26
+ server_information = {}
27
+ match server.type:
28
+ case "azure":
29
+ server_information["dagster/uri"] = server.location
30
+ case "databricks":
31
+ server_information["dagster/uri"] = server.host
32
+ server_information["dagster/table_name"] = (
33
+ f"{server.catalog}.{server.schema}.{asset_name}"
34
+ )
35
+ case "kafka":
36
+ server_information["dagster/uri"] = server.host
37
+ case "kinesis":
38
+ server_information = {}
39
+ case "local":
40
+ server_information["dagster/uri"] = _normalize_path(server.path)
41
+ case "oracle":
42
+ server_information["dagster/uri"] = f"{server.host}:{server.port}"
43
+ case "postgres":
44
+ server_information["dagster/uri"] = f"{server.host}:{server.port}"
45
+ server_information["dagster/table_name"] = (
46
+ f"{server.database}.{server.schema}.{asset_name}"
47
+ )
48
+ case "pubsub":
49
+ server_information = {}
50
+ case "redshift":
51
+ server_information["dagster/uri"] = server.endpoint
52
+ server_information["dagster/table_name"] = (
53
+ f"{server.database}.{server.schema}.{asset_name}"
54
+ )
55
+ case "s3":
56
+ server_information["dagster/uri"] = server.location
57
+ case "sftp":
58
+ server_information["dagster/uri"] = server.location
59
+ case "snowflake":
60
+ server_information["dagster/table_name"] = (
61
+ f"{server.database}.{server.schema}.{asset_name}"
62
+ )
63
+ case "sqlserver":
64
+ server_information["dagster/table_name"] = (
65
+ f"{server.database}.{server.schema}.{asset_name}"
66
+ )
67
+ case "trino":
68
+ server_information["dagster/uri"] = f"{server.host}:{server.port}"
69
+ server_information["dagster/table_name"] = (
70
+ f"{server.catalog}.{server.schema}.{asset_name}"
71
+ )
72
+ case _:
73
+ server_information = {}
74
+
75
+ return server_information
@@ -0,0 +1,109 @@
1
+ import json
2
+ from typing import Any
3
+
4
+ import dagster as dg
5
+ from dagster import TableColumnDep
6
+ from datacontract.model.data_contract_specification import Field
7
+
8
+ from dagster_datacontract.tags import get_tags
9
+
10
+
11
def get_other_item(name: str, column_field: Field) -> list[str]:
    """Format one attribute of a field as a ``"name=value"`` constraint entry.

    Args:
        name: Attribute to read from ``column_field``.
        column_field: Object carrying the attribute (a missing attribute is
            treated like an unset one).

    Returns:
        A one-element list such as ``["format=csv"]`` when the attribute is
        set, otherwise an empty list. Numeric zero counts as set (so that
        bounds like ``minimum=0`` are kept); ``None``, ``False`` and empty
        values do not.
    """
    value = getattr(column_field, name, None)

    # bool is a subclass of int, so exclude it explicitly: False stays "unset".
    is_numeric_zero = (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and value == 0
    )
    if not value and not is_numeric_zero:
        return []

    return [f"{name}={value}"]
24
+
25
+
26
def get_table_column_constraints(column_field: Field) -> dg.TableColumnConstraints:
    """Translate a contract field into Dagster column constraints.

    Args:
        column_field: Field specification of the column.

    Returns:
        dg.TableColumnConstraints: ``nullable`` is the negation of the
        field's ``required`` flag, ``unique`` mirrors the ``unique`` flag
        (an unset flag counts as ``False``), and ``other`` lists the
        remaining set attributes as ``"name=value"`` strings.
    """
    other_attributes = (
        "title",
        "primaryKey",
        "format",
        "minLength",
        "maxLength",
        "pattern",
        "minimum",
        "exclusiveMinimum",
        "maximum",
        "exclusiveMaximum",
        "pii",
        "classification",
    )

    other = []
    for attribute in other_attributes:
        other.extend(get_other_item(attribute, column_field))

    return dg.TableColumnConstraints(
        # A required column must not be reported as nullable; an unset
        # `required` flag keeps the permissive default.
        nullable=not column_field.required,
        unique=bool(column_field.unique),
        other=other,
    )
59
+
60
+
61
def get_table_column(column_name: str, column_field: Field) -> dg.TableColumn:
    """Build the Dagster ``TableColumn`` for one contract field.

    Args:
        column_name (str): Name to give the column.
        column_field (Field): Field specification supplying the type,
            description, constraints and tags.

    Returns:
        dg.TableColumn: Column definition carrying the field's type and
        description, its converted constraints and its parsed tags.
    """
    column_constraints = get_table_column_constraints(column_field)
    column_tags = get_tags(column_field.tags)

    return dg.TableColumn(
        name=column_name,
        type=column_field.type,
        description=column_field.description,
        constraints=column_constraints,
        tags=column_tags,
    )
80
+
81
+
82
def get_column_lineage(column_field: Field) -> list[TableColumnDep]:
    """Extract column-level lineage from a field as Dagster column dependencies.

    The field is serialised to JSON and the entries under
    ``lineage.inputFields`` are read from the result. Each entry becomes a
    ``TableColumnDep`` whose asset key is the entry's ``"name"`` and whose
    column is the entry's ``"field"``.

    Args:
        column_field: Field that may carry lineage metadata.

    Returns:
        One ``TableColumnDep`` per input field, or an empty list when the
        field has no lineage or the lineage has no ``inputFields``.
    """
    lineage = json.loads(column_field.model_dump_json()).get("lineage")
    if not lineage:
        return []

    # `inputFields` is optional inside a lineage object; treat absent/null as empty.
    lineage_entries = lineage.get("inputFields") or []
    return [
        dg.TableColumnDep(
            asset_key=dg.AssetKey(lineage_entry["name"]),
            column_name=lineage_entry["field"],
        )
        for lineage_entry in lineage_entries
    ]
@@ -0,0 +1,3 @@
1
+ from dagster_datacontract.tags.tags import get_tags
2
+
3
+ __all__ = ["get_tags"]
@@ -0,0 +1,45 @@
1
+ import re
2
+
3
+ from loguru import logger
4
+
5
+
6
def get_tags(
    tags_list: list[str] | None,
) -> dict[str, str]:
    r"""Parse and validate a list of string tags into a dictionary.

    Each tag is either ``"key:value"`` or just ``"key"``; surrounding
    whitespace of key and value is stripped, and only the first ``:``
    separates key from value.

    - Keys must match the pattern: ``^[\w.-]{1,63}$``
    - Values must match the pattern: ``^[\w.-]{0,63}$``

    Tags that do not match are skipped and a warning is logged.

    More information about Dagster tags:
    https://docs.dagster.io/guides/build/assets/metadata-and-tags/tags

    Args:
        tags_list: Tags as strings, or ``None`` when the contract defines
            no tags.

    Returns:
        Mapping of tag name to tag value (empty string for key-only tags).
        Empty when ``tags_list`` is ``None`` or empty.
    """
    key_pattern = re.compile(r"^[\w.-]{1,63}$")
    val_pattern = re.compile(r"^[\w.-]{0,63}$")

    tags = {}

    # Tags are optional in a contract, so None must behave like an empty list.
    for item in tags_list or []:
        key, _, val = item.partition(":")
        key, val = key.strip(), val.strip()

        if key_pattern.match(key) and val_pattern.match(val):
            tags[key] = val
        else:
            logger.warning(f"Ignoring invalid tag: {item}")

    return tags
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dagster-datacontract
3
- Version: 0.2.4
3
+ Version: 0.3.0
4
4
  Summary: Load metadata and asset check spesifications from data contracts.
5
5
  Author-email: Fredrik Bakken <fredrik@dataheim.io>
6
6
  Requires-Python: >=3.10.0
@@ -0,0 +1,13 @@
1
+ dagster_datacontract/__init__.py,sha256=UVpDzZYjmskEDJqEewMD52uWvu183O5RV9hjnP8mYAc,5488
2
+ dagster_datacontract/description/__init__.py,sha256=ulWqPp5jIPvCzaDFZcjLjcDkljJ5j_FRsE0dXhK8Wlc,104
3
+ dagster_datacontract/description/description.py,sha256=FmjgCYDpJ9UHrvAv0sAthfRohDjdG0lL1XcMKK8QMmI,1646
4
+ dagster_datacontract/metadata/__init__.py,sha256=e-xmcWWoAhmKTwosshsxnyrjI1j-UyY6YpdpzA2ggF4,269
5
+ dagster_datacontract/metadata/server_information.py,sha256=m1pv9sMfVjGIjzYVJ9R-KF-ABXCgAGKKH12dgAbm_jQ,2669
6
+ dagster_datacontract/metadata/table_colums.py,sha256=Q7ZCiMReWU4-T2YfBvtt5vvoVXEoUgzK5OPMxQEgzpQ,4013
7
+ dagster_datacontract/tags/__init__.py,sha256=2Ph-M0WbBKUjJWIzM_cEBW3SQZh7Nq8oy5MbD5bt_lc,76
8
+ dagster_datacontract/tags/tags.py,sha256=aZ_HTkc-vjJ_rofT32fT_zrLCt9x1ZGn8XoihhOMhfU,1414
9
+ dagster_datacontract-0.3.0.dist-info/licenses/LICENSE,sha256=9ULsEM1ICzCaGoso40plwO-d_SCQ7nsU6ZA4xgfaRq8,11338
10
+ dagster_datacontract-0.3.0.dist-info/METADATA,sha256=1uuc5HEqV3OYmP7jNe9TZftEcirYwKbRXbnoAOe3pWQ,3029
11
+ dagster_datacontract-0.3.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
12
+ dagster_datacontract-0.3.0.dist-info/top_level.txt,sha256=_HUQ6OJ50Q0VZxEkdocTtxk1QkJpztb1QY7A0rcvtCE,21
13
+ dagster_datacontract-0.3.0.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- dagster_datacontract/__init__.py,sha256=uL2Dgw7FHykepsmGQjU51kuQ7lKlVBLUxx4_nGn_3VU,8225
2
- dagster_datacontract-0.2.4.dist-info/licenses/LICENSE,sha256=9ULsEM1ICzCaGoso40plwO-d_SCQ7nsU6ZA4xgfaRq8,11338
3
- dagster_datacontract-0.2.4.dist-info/METADATA,sha256=53-UmbMwaHB2LiAnIP0tQrYKR2rU9Jbcee9fYQAWG7M,3029
4
- dagster_datacontract-0.2.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
5
- dagster_datacontract-0.2.4.dist-info/top_level.txt,sha256=_HUQ6OJ50Q0VZxEkdocTtxk1QkJpztb1QY7A0rcvtCE,21
6
- dagster_datacontract-0.2.4.dist-info/RECORD,,