dasl-client 1.0.6__tar.gz → 1.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dasl-client might be problematic.
- {dasl_client-1.0.6 → dasl_client-1.0.9}/PKG-INFO +3 -2
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/client.py +10 -8
- dasl_client-1.0.9/dasl_client/conn/client_identifier.py +23 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/conn/conn.py +1 -1
- dasl_client-1.0.9/dasl_client/preset_development/__init__.py +4 -0
- dasl_client-1.0.9/dasl_client/preset_development/errors.py +159 -0
- dasl_client-1.0.9/dasl_client/preset_development/preview_engine.py +344 -0
- dasl_client-1.0.9/dasl_client/preset_development/preview_parameters.py +386 -0
- dasl_client-1.0.9/dasl_client/preset_development/stage.py +559 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/admin_config.py +10 -7
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/datasource.py +177 -155
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/dbui.py +46 -34
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/rule.py +91 -65
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/types.py +72 -52
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/workspace_config.py +86 -123
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client.egg-info/PKG-INFO +3 -2
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client.egg-info/SOURCES.txt +6 -1
- dasl_client-1.0.9/dasl_client.egg-info/requires.txt +4 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/pyproject.toml +4 -3
- {dasl_client-1.0.6 → dasl_client-1.0.9}/test/test_api_changes.py +0 -1
- {dasl_client-1.0.6 → dasl_client-1.0.9}/test/test_api_surface.py +1 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/test/test_marshaling.py +3 -62
- dasl_client-1.0.6/dasl_client/conn/user_agent.py +0 -11
- dasl_client-1.0.6/dasl_client.egg-info/requires.txt +0 -3
- {dasl_client-1.0.6 → dasl_client-1.0.9}/LICENSE +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/README.md +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/__init__.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/auth/__init__.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/auth/auth.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/conn/__init__.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/errors/__init__.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/errors/errors.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/helpers.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/__init__.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client/types/helpers.py +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client.egg-info/dependency_links.txt +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/dasl_client.egg-info/top_level.txt +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/setup.cfg +0 -0
- {dasl_client-1.0.6 → dasl_client-1.0.9}/setup.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dasl_client
-Version: 1.0.6
+Version: 1.0.9
 Summary: The DASL client library used for interacting with the DASL workspace
 Home-page: https://github.com/antimatter/asl
 Author: Antimatter Team
@@ -8,9 +8,10 @@ Author-email: Antimatter Team <support@antimatter.io>
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: dasl_api==0.1.
+Requires-Dist: dasl_api==0.1.12
 Requires-Dist: databricks-sdk>=0.41.0
 Requires-Dist: pydantic>=2
+Requires-Dist: typing_extensions==4.10.0
 
 # DASL Client Library
 
```
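The regenerated dasl_client.egg-info/requires.txt (+4 lines in the file list above) presumably mirrors the four Requires-Dist entries, giving 1.0.9 this dependency set:

```
dasl_api==0.1.12
databricks-sdk>=0.41.0
pydantic>=2
typing_extensions==4.10.0
```

The notable changes are the dasl_api bump to 0.1.12 and the new exact pin on typing_extensions 4.10.0.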
dasl_client/client.py

```diff
@@ -51,7 +51,7 @@ class Client:
 
         :param auth: Authorization instance for authorizing requests to
             the dasl control plane.
-        :returns Client
+        :returns: Client
         """
         self.auth = auth
 
@@ -83,7 +83,7 @@ class Client:
         :param dasl_host: The URL of the DASL server. This value should
             not generally be specified unless you are testing against
             an alternative environment.
-        :returns Client
+        :returns: Client for the newly created workspace.
         """
         with error_handler():
             if workspace_url is None:
@@ -131,7 +131,7 @@ class Client:
         :param dasl_host: The URL of the DASL server. This value should
             not generally be specified unless you are testing against
             an alternative environment.
-        :returns Client
+        :returns: Client for the existing workspace.
         """
         with error_handler():
             if workspace_url is None:
@@ -195,7 +195,7 @@ class Client:
         :param dasl_host: The URL of the DASL server. This value should
             not generally be specified unless you are testing against
             an alternative environment.
-        :returns Client
+        :returns: Client for the newly created or existing workspace.
         """
         try:
             return Client.new_workspace(
@@ -274,7 +274,7 @@ class Client:
         you will need to repopulate the service_principal_secret correctly
         before passing the result back to put_admin_config.
 
-        :returns AdminConfig
+        :returns: AdminConfig containing the current settings.
         """
         with error_handler():
             return AdminConfig.from_api_obj(
@@ -306,6 +306,8 @@ class Client:
         Retrieve the WorkspaceConfig from the DASL server. The returned
         value can be updated directly and passed to put_config in order
         to make changes.
+
+        :returns: WorkspaceConfig containing the current configuration.
         """
         with error_handler():
             return WorkspaceConfig.from_api_obj(
@@ -348,7 +350,7 @@ class Client:
         in order to make changes.
 
         :param name: The unique name of the DataSource within this workspace
-        :returns DataSource
+        :returns: DataSource
         """
         with error_handler():
             return DataSource.from_api_obj(
@@ -459,7 +461,7 @@ class Client:
         in order to make changes.
 
         :param name: The unique name of the Rule within this workspace
-        :returns Rule
+        :returns: Rule
         """
         with error_handler():
             return Rule.from_api_obj(
@@ -601,7 +603,7 @@ class Client:
         :param value: The observable value
         :param cursor: A cursor to be used when paginating results
         :param limit: A limit of the number of results to return
-        :returns EventsList
+        :returns: EventsList
         """
         with error_handler():
             return Dbui.ObservableEvents.EventsList.from_api_obj(
```
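Most of the client.py changes above correct the same docstring defect: `:returns Client` is not a valid reST field, so Sphinx and IDE tooltips ignore it, while `:returns: Client` is parsed as a return-value description. A minimal sketch of the corrected convention (hypothetical method, body elided):

```python
def get_datasource(self, name: str) -> "DataSource":
    """
    Retrieve the DataSource from the DASL server.

    :param name: The unique name of the DataSource within this workspace
    :returns: DataSource
    """
    ...
```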
dasl_client/conn/client_identifier.py (new file)

```diff
@@ -0,0 +1,23 @@
+from importlib.metadata import version
+
+
+def get_user_agent() -> str:
+    """
+    A helper function defining the user agent for requests originating from
+    the ASL python conn library. We include the version of the API that the
+    connection was built off.
+
+    :return: A user-agent string.
+    """
+    return get_client_identifier()
+
+
+def get_client_identifier() -> str:
+    """
+    A helper function defining the client identifier for identifying the python
+    client. We include the version of the version of the API that the client
+    uses.
+
+    :return: The python client identifier.
+    """
+    return f"asl-python-client/{version('dasl_api')}"
```
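A short usage sketch for the new module; the version suffix is read from whichever dasl_api distribution is installed, so 0.1.12 under the new pin:

```python
from dasl_client.conn.client_identifier import get_client_identifier, get_user_agent

# get_user_agent() delegates to get_client_identifier(), so both return
# the same string, e.g. "asl-python-client/0.1.12".
print(get_user_agent())
print(get_client_identifier())
```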
dasl_client/conn/conn.py

```diff
@@ -4,7 +4,7 @@ from typing import Optional
 import urllib3
 from dasl_api import ApiClient, Configuration
 
-from dasl_client.conn.user_agent import get_user_agent
+from dasl_client.conn.client_identifier import get_user_agent
 
 
 def get_base_conn(enable_retries: bool = True, host: Optional[str] = None) -> ApiClient:
```
dasl_client/preset_development/errors.py (new file)

```diff
@@ -0,0 +1,159 @@
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import *
+from pyspark.sql.dataframe import DataFrame
+from typing import Dict, Any, List, Mapping, Tuple
+from IPython import get_ipython
+
+
+class PresetError(Exception):
+    pass
+
+
+class InvalidGoldTableSchemaError(PresetError):
+    def __init__(self, schema: str, additional_message: str = ""):
+        self.schema = schema
+        message = (
+            f"Malformed gold schema provided {schema}. {additional_message}".strip()
+        )
+        super().__init__(message)
+
+
+class NoSilverStageProvdedError(PresetError):
+    def __init__(self, additional_msg: str = ""):
+        message = f"No silver stage provided{additional_msg}."
+        super().__init__(message)
+
+
+class NoSilverTransformStageProvdedError(PresetError):
+    def __init__(
+        self,
+        message: str = "No silver transform stage provided, but gold stage is present.",
+    ):
+        super().__init__(message)
+
+
+class PreTransformNotFound(PresetError):
+    def __init__(
+        self,
+        message: str = "Requested silver pretransform name not found in preset's silver pretransforms.",
+    ):
+        super().__init__(message)
+
+
+class NoSilverPreTransformStageProvdedError(PresetError):
+    def __init__(
+        self,
+        message: str = "No silver transform stage provided, but prestransform name provided.",
+    ):
+        super().__init__(message)
+
+
+class MissingTableFieldError(PresetError):
+    def __init__(self, layer: str, table_name: str, field_name: str):
+        self.layer = layer
+        self.table_name = table_name
+        self.field_name = field_name
+        message = f"{layer} stage {table_name} is missing {field_name} field."
+        super().__init__(message)
+
+
+class DuplicateFieldNameError(PresetError):
+    def __init__(self, stage: str, stage_name: str, field_name: str):
+        self.stage = stage
+        self.stage_name = stage_name
+        self.field_name = field_name
+        message = f"Duplicate field specification name found in {stage} stage {stage_name} named {field_name}."
+        super().__init__(message)
+
+
+class MalformedFieldError(PresetError):
+    def __init__(self, stage: str, stage_name: str, field_name: str):
+        self.stage = stage
+        self.stage_name = stage_name
+        self.field_name = field_name
+        message = f"Please provide 1 operation only in {stage} stage {stage_name}'s field specification named {field_name}."
+        super().__init__(message)
+
+
+class MissingFieldNameError(PresetError):
+    def __init__(self, stage: str, stage_name: str):
+        self.stage = stage
+        self.stage_name = stage_name
+        message = (
+            f"Field specification in {stage} stage {stage_name} missing name field."
+        )
+        super().__init__(message)
+
+
+class MissingSilverKeysError(PresetError):
+    def __init__(self, missing_keys: str):
+        self.missing_keys = missing_keys
+        message = f"Gold table/s have no corresponding input from silver table/s: {missing_keys}"
+        super().__init__(message)
+
+
+class MissingAutoloaderConfigError(PresetError):
+    def __init__(
+        self,
+        message: str = "Autoloader mode selected, but no autoloader configuration found in preset.autoloader.",
+    ):
+        super().__init__(message)
+
+
+class AutoloaderMissingFieldError(PresetError):
+    def __init__(self, field_name: str):
+        self.field_name = field_name
+        message = f"Autoloader mode selected, but missing field {field_name} in preset."
+        super().__init__(message)
+
+
+class UnknownGoldTableError(PresetError):
+    def __init__(self, table_name: str, schema: str):
+        self.table_name = table_name
+        self.schema = schema
+        message = (
+            f"The referenced Gold table name {table_name} does not exist in {schema}."
+        )
+        super().__init__(message)
+
+
+class GoldTableCompatibilityError(PresetError):
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class ReferencedColumnMissingError(PresetError):
+    def __init__(self, operation: str, column_name: str):
+        self.operation = operation
+        self.column_name = column_name
+        message = f"The referenced column {column_name} was not found in the dataframe during {operation} operation."
+        super().__init__(message)
+
+
+class MissingJoinFieldError(PresetError):
+    def __init__(self, field_name: str):
+        self.field_name = field_name
+        message = f"Join operation is missing required field {field_name}."
+        super().__init__(message)
+
+
+class MissingUtilityConfigurationFieldError(PresetError):
+    def __init__(self, operation: str, field_name: str):
+        self.operation = operation
+        self.field_name = field_name
+        message = f"The required configuration field {field_name} was not suppled in the {operation} operation."
+        super().__init__(message)
+
+
+class AssertionFailedError(PresetError):
+    def __init__(self, expr: str, assertion_message: str, df: DataFrame):
+        # Get the Databricks built-in functions out the namespace.
+        ipython = get_ipython()
+        display = ipython.user_ns["display"]
+
+        self.expr = expr
+        self.assertion_message = assertion_message
+        self.df = df
+        message = f"The above rows failed the assertion expression {expr} with reason: {assertion_message}\n"
+        display(df)
+        super().__init__(message)
```
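Every exception above derives from the PresetError base class, so notebook code driving a preview run can catch the whole family with a single clause. A minimal runnable sketch (in practice these are raised from the engine's validation paths rather than directly):

```python
from dasl_client.preset_development.errors import (
    MissingTableFieldError,
    PresetError,
)

try:
    raise MissingTableFieldError("Gold", "http_activity", "input")
except PresetError as err:
    # Covers InvalidGoldTableSchemaError, MissingTableFieldError,
    # GoldTableCompatibilityError, and the rest of the hierarchy.
    print(f"Preset validation failed: {err}")
```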
dasl_client/preset_development/preview_engine.py (new file)

```diff
@@ -0,0 +1,344 @@
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import *
+from pyspark.sql.dataframe import DataFrame
+from pyspark.sql.functions import lit, col as col_, sum as sum_, when
+from dasl_client.preset_development.preview_parameters import *
+from dasl_client.preset_development.stage import *
+from dasl_client.preset_development.errors import *
+import yaml
+from IPython import get_ipython
+
+
+class PreviewEngine:
+    """
+    This class deserializes the in-development preset's YAML and performs a series of
+    validation steps before attempting to compile each stage's table and execute them
+    based on the provided PreviewParameters.
+
+    Upon successful execution, output is generated for each successfully executed
+    stage's table operations. Additionally, if Gold stages are computed, their outputs
+    are validated against the provided Gold stage tables to ensure compatibility on a
+    per-table-name basis with the Unity Catalog.
+
+    For example, a preset Gold stage table named "http_activity" will be checked against
+    the corresponding table in the Unity Catalog schema—also named "http_activity" to
+    confirm that inserting into the Unity Catalog most likely not cause errors.
+    """
+
+    def __init__(
+        self, spark: SparkSession, preset_yaml_str: str, ds_params: PreviewParameters
+    ):
+        """
+        Creates the PreviewEngine using the given preset YAML and datasource parameters.
+        The YAML is deserialized here and checked to verify whether the requested
+        pretransform name, if provided, exists in the preset.
+
+        Instance Attributes:
+            ds_params (PreviewParameters): The input datasource's configuration.
+            preset (Dict[str, Any]): The deserialized preset YAML.
+            pretransform_name (str): The name of the requested pretransform. Defaults to None.
+            pre (Stage): Stores the pretransform Stage object internally.
+            silver (List[Stage]): Stores the Silver Stage objects internally.
+            gold (List[Stage]): Stores the Gold Stage objects internally.
+        """
+        self._spark = spark
+        self._ds_params = ds_params
+        self._preset = yaml.safe_load(preset_yaml_str)
+        self._pretransform_name = ds_params._pretransform_name
+
+        self._validate_gold_inputs(
+            self._preset.get("silver", None), self._preset.get("gold", None)
+        )
+        if self._pretransform_name:
+            self._validate_pretransform_name(
+                self._preset.get("silver", None), self._pretransform_name
+            )
+
+        self._pre = None
+        self._silver = []
+        self._gold = []
+        self._result_df_map = {}
+
+    def _validate_pretransform_name(
+        self, silver: Dict[str, str], pretransform_name: str
+    ) -> None:
+        """
+        Validates the given pretransform name exists in the provided preset's Silver
+        PreTransform stages.
+        """
+        if not silver:
+            raise NoSilverStageProvdedError(", but pretransform name provided")
+        if not (silver_pre_transform := silver.get("preTransform", None)):
+            raise NoSilverPreTransformStageProvdedError()
+        silver_pre_output_names = []
+        for table in silver_pre_transform:
+            if not (name := table.get("name", None)):
+                raise MissingTableFieldError(
+                    "Silver pretransform",
+                    table.get("name", "<stage missing name>"),
+                    "name",
+                )
+            silver_pre_output_names += [name]
+        if pretransform_name not in silver_pre_output_names:
+            raise PreTransformNotFound()
+
+    def _validate_gold_inputs(
+        self, silver: Dict[str, str], gold: Dict[str, str]
+    ) -> None:
+        """
+        Validate gold tables all have a silver table to input from.
+        """
+        if not gold:
+            return
+
+        if not len(gold):
+            return
+
+        if not silver:
+            raise NoSilverStageProvdedError(", but gold stage is present")
+
+        gold_input_names = []
+        for table in gold:
+            if not (input := table.get("input", None)):
+                raise MissingTableFieldError(
+                    "Gold", table.get("name", "<stage missing name>"), "input"
+                )
+            gold_input_names += [input]
+
+        if not (silver_transform := silver.get("transform", None)):
+            raise NoSilverTransformStageProvdedError()
+        silver_output_names = []
+        for table in silver_transform:
+            if not (name := table.get("name", None)):
+                raise MissingTableFieldError(
+                    "Silver transform", table.get("name", ""), "name"
+                )
+            silver_output_names += [name]
+
+        missing_keys = set(gold_input_names) - set(silver_output_names)
+        if missing_keys:
+            raise MissingSilverKeysError(missing_keys)
+
+    def _compile_stages(self) -> None:
+        """
+        Creates Stage objects, setting pretransform to None if not provided.
+        """
+        pretransform = None
+        if self._pretransform_name:
+            for table in self._preset["silver"]["preTransform"]:
+                if table["name"] == self._pretransform_name:
+                    self._pre = Stage(self._spark, "silver pretransform", table)
+                    break
+
+        self._silver = [
+            Stage(self._spark, "silver transform", table)
+            for table in self._preset.get("silver", {}).get("transform", [])
+        ]
+        self._gold = [
+            Stage(self._spark, "gold", table) for table in self._preset.get("gold", [])
+        ]
+
+    def _run(
+        self, df: DataFrame
+    ) -> Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]]:
+        """
+        Runs all stages, in medallion stage order. This allows prior stage outputs to feed
+        into later stage inputs.
+
+        Returns:
+            Dataframes containing the output from each run Stage.
+        """
+        if self._pre:
+            df = self._pre.run(df)
+
+        silver_output_map = {}
+        for table in self._silver:
+            silver_output_map[table._name] = table.run(df)
+
+        gold_output_map = {}
+        for table in self._gold:
+            # We store as gold_name/silver_input to prevent clobbering on duplicate gold table use.
+            gold_output_map[f"{table._name}/{table._input}"] = table.run(
+                silver_output_map[table._input]
+            )
+
+        return (
+            (df, silver_output_map, gold_output_map)
+            if self._pre
+            else (None, silver_output_map, gold_output_map)
+        )
+
+    def _render_output(
+        self,
+        input_df: DataFrame,
+        stage_dataframes: Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]],
+        gold_table_schema: str,
+    ) -> None:
+        """
+        Displays formatted HTML output from executed Stages' DataFrames.
+        """
+        # TODO: Investigate further into using Databricks's style sheets here.
+
+        # Get the Databricks built-in functions out the namespace.
+        ipython = get_ipython()
+        displayHTML = ipython.user_ns["displayHTML"]
+        display = ipython.user_ns["display"]
+
+        def d(txt, lvl) -> None:
+            displayHTML(
+                f"""
+                <div style="background-color:
+                background-color: rgb(18, 23, 26); padding: 0; margin: 0;">
+                    <h{lvl} style="margin: 0; background-color: rgb(244, 234, 229);">{txt}</h{lvl}>
+                </div>
+                """
+            )
+
+        def check_struct_compatibility(
+            target_field: StructField, df_field: StructField, prefix=""
+        ):
+            if not (
+                isinstance(target_field.dataType, StructType)
+                and isinstance(df_field.dataType, StructType)
+            ):
+                return
+
+            target_fields = {
+                field.name: field for field in target_field.dataType.fields
+            }
+            for field in df_field.dataType.fields:
+                if field.name not in target_fields:
+                    raise GoldTableCompatibilityError(
+                        f"Extra field found in gold stage output STRUCT column {prefix}{target_field.name}: {field.name}"
+                    )
+                else:
+                    if isinstance(field.dataType, StructType):
+                        check_struct_compatibility(
+                            target_fields[field.name],
+                            field,
+                            prefix=prefix + target_field.name + ".",
+                        )
+
+        (pre_df, silver, gold) = stage_dataframes
+        d("Input", 1)
+        display(input_df)
+        d("Silver Pre-Transform", 1)
+        if pre_df:
+            display(pre_df)
+        else:
+            d("Skipped", 2)
+        d("Silver Transform", 1)
+        for name, df in silver.items():
+            d(f"{name}", 2)
+            display(df)
+        d("Gold", 1)
+        for name, df in gold.items():
+            d(f"{name}", 2)
+            d("Stage output", 3)
+            display(df)
+
+            # NOTE: Name is stored as Gold_name/Silver_input. So we need to get just the Gold table
+            # name that we are comparing the dataframe metadata to.
+            name = name.split("/")[0]
+
+            if not self._spark.catalog.tableExists(f"{gold_table_schema}.{name}"):
+                raise UnknownGoldTableError(name, gold_table_schema)
+
+            # Performs the type check.
+            delta_df = self._spark.table(f"{gold_table_schema}.{name}").limit(0)
+            unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
+
+            # Now we check no new columns.
+            if not set(df.columns).issubset(delta_df.columns):
+                raise GoldTableCompatibilityError(
+                    f"Extra columns provided: {', '.join([col for col in df.columns if col not in delta_df.columns])}"
+                )
+
+            # Now we check no new fields in STRUCT columns.
+            for field in delta_df.schema.fields:
+                if isinstance(field.dataType, StructType) and field.name in df.columns:
+                    # Retrieve the corresponding field from the DataFrame's schema.
+                    df_field = next(f for f in df.schema.fields if f.name == field.name)
+                    check_struct_compatibility(field, df_field)
+
+            # Check nullable columns exist, and data what we are inserting is set.
+            non_nullable_cols = [
+                field.name for field in delta_df.schema.fields if not field.nullable
+            ]
+            null_checks = [
+                sum_(when(col_(col).isNull(), 1).otherwise(0)).alias(col)
+                for col in non_nullable_cols
+            ]
+            null_counts = df.select(null_checks).collect()[0].asDict()
+            cols_with_nulls = []
+            try:
+                cols_with_nulls = [
+                    col_name for col_name, count in null_counts.items() if count > 0
+                ]
+            except TypeError:
+                # There were no records returned and so null_counts == None.
+                pass
+            if cols_with_nulls:
+                raise GoldTableCompatibilityError(
+                    f"Record with null data found for non-nullable columns: {', '.join([col for col in cols_with_nulls])}"
+                )
+
+            d("Resultant gold table preview", 3)
+            display(unioned_df)
+
+    def evaluate(self, gold_table_schema: str) -> None:
+        """
+        Evaluates the loaded preset YAML using the input datasource configuration to load
+        records. Finally, checks that the output from the Gold stages is compatible with
+        the Unity Catalog Gold tables.
+        """
+        s = gold_table_schema.split(".")
+        if len(s) != 2:
+            raise InvalidGoldTableSchemaError(gold_table_schema)
+        catalog_name = s[0]
+        schema_name = s[1]
+        if any(
+            row.catalog == catalog_name
+            for row in self._spark.sql("SHOW CATALOGS").collect()
+        ):
+            if not any(
+                row.databaseName == schema_name
+                for row in self._spark.sql(f"SHOW SCHEMAS IN {catalog_name}").collect()
+            ):
+                raise InvalidGoldTableSchemaError(
+                    gold_table_schema,
+                    f"Schema {schema_name} not found in catalog {catalog_name} or insufficient permissions.",
+                )
+        else:
+            raise InvalidGoldTableSchemaError(
+                gold_table_schema,
+                f"Catalog {catalog_name} not found or insufficient permissions.",
+            )
+
+        # If we are using the autoloader, fetch format from preset and others.
+        if self._ds_params._mode == "autoloader":
+            if not (autoloader_conf := self._preset.get("autoloader", None)):
+                raise MissingAutoloaderConfigError()
+            if not (file_format := autoloader_conf.get("format", None)):
+                raise AutoloaderMissingFieldError("format")
+            self._ds_params.set_autoloader_format(file_format)
+            if schemaFile := autoloader_conf.get("schemaFile", None):
+                self._ds_params.set_autoloader_schema_file(schemaFile)
+            if multiline := autoloader_conf.get("multiline", None):
+                if multiline == "true":
+                    self._ds_params.set_multiline(True)
+                else:
+                    self._ds_params.set_multiline(False)
+            if cloudFiles := autoloader_conf.get("cloudFiles", None):
+                if schema_hints := cloudFiles.get("schemaHints", None):
+                    self._ds_params.set_autoloader_cloudfiles_schema_hints(schema_hints)
+                if schema_hints_file := cloudFiles.get("schemaHintsFile", None):
+                    self._ds_params.set_autoloader_cloudfiles_schema_hint_file(
+                        schema_hints_file
+                    )
+
+        self._compile_stages()
+
+        with self._ds_params as df:
+            self._result_df_map = self._run(df)
+            self._render_output(df, self._result_df_map, gold_table_schema)
```
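The Gold compatibility check in `_render_output` leans on a cheap Spark idiom: union the stage output against a zero-row `limit(0)` slice of the target table so Spark type-checks the schemas without moving data, then count nulls per non-nullable column in a single aggregation pass. A standalone sketch of that probe, with hypothetical table and column names:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as sum_, when

spark = SparkSession.builder.getOrCreate()

# Zero-row slice of the target: carries the full schema, reads no data.
target = spark.table("my_catalog.my_schema.http_activity").limit(0)  # hypothetical table
staged = spark.createDataFrame([("GET", 200)], ["http_method", "status_code"])

# Raises an AnalysisException if a shared column has an incompatible type;
# allowMissingColumns tolerates columns the stage output does not populate.
preview = target.unionByName(staged, allowMissingColumns=True)

# One aggregation pass counting nulls in each non-nullable column the stage emits.
non_nullable = [
    f.name for f in target.schema.fields
    if not f.nullable and f.name in staged.columns
]
if non_nullable:
    checks = [sum_(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in non_nullable]
    counts = staged.select(checks).collect()[0].asDict()
    bad = [c for c, n in counts.items() if n and n > 0]
    if bad:
        raise ValueError(f"Nulls in non-nullable columns: {', '.join(bad)}")
```

Probing against `limit(0)` rather than the full table keeps the preview cheap while still exercising Spark's union type resolution.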