dasl-client 1.0.7__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dasl-client might be problematic.
- dasl_client/auth/auth.py +6 -5
- dasl_client/client.py +45 -9
- dasl_client/preset_development/__init__.py +4 -0
- dasl_client/preset_development/errors.py +159 -0
- dasl_client/preset_development/preview_engine.py +344 -0
- dasl_client/preset_development/preview_parameters.py +386 -0
- dasl_client/preset_development/stage.py +559 -0
- dasl_client/types/__init__.py +1 -0
- dasl_client/types/admin_config.py +10 -7
- dasl_client/types/content.py +235 -0
- dasl_client/types/datasource.py +177 -138
- dasl_client/types/dbui.py +46 -34
- dasl_client/types/rule.py +91 -65
- dasl_client/types/types.py +67 -54
- dasl_client/types/workspace_config.py +90 -74
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/METADATA +3 -2
- dasl_client-1.0.11.dist-info/RECORD +29 -0
- dasl_client-1.0.7.dist-info/RECORD +0 -23
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/LICENSE +0 -0
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/WHEEL +0 -0
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/top_level.txt +0 -0
dasl_client/auth/auth.py
CHANGED
@@ -1,4 +1,5 @@
 import abc
+import base64
 import time
 from datetime import datetime
 
@@ -176,15 +177,15 @@ class DatabricksSecretAuth(Authorization):
             principalName=self._principal,
         )
         handler = api.WorkspaceV1Api(api_client=self._client)
-        resp = handler.
-            workspace=self._workspace,
+        resp = handler.workspace_v1_request_secret(
+            workspace=self._workspace, workspace_v1_request_secret_request=req
         )
         secret_name = resp.secret_name
         secret_value = ""
         for tries in range(3):
             try:
-                secret_value =
-                    secret_name, "token"
+                secret_value = (
+                    WorkspaceClient().secrets.get_secret(secret_name, "token").value
                 )
                 break
             except ResourceDoesNotExist:
@@ -194,7 +195,7 @@ class DatabricksSecretAuth(Authorization):
             raise RuntimeError(f"failed to complete secret auth")
 
         req = WorkspaceV1AuthenticateRequest(
-            databricks_secret=secret_value,
+            databricks_secret=base64.b64decode(secret_value).decode("utf-8"),
         )
         handler = api.WorkspaceV1Api(api_client=self._client)
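The decode added above reflects how Databricks serves secrets: the secrets API returns values base64-encoded, so the client must decode before forwarding the token. A minimal sketch of that round-trip, assuming the databricks-sdk package and ambient workspace credentials (the scope and key names are illustrative, not taken from this diff):

```python
import base64

from databricks.sdk import WorkspaceClient

# get_secret returns the payload base64-encoded; decode it before use.
raw = WorkspaceClient().secrets.get_secret("dasl-scope", "token").value
token = base64.b64decode(raw).decode("utf-8")
```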
dasl_client/client.py
CHANGED
@@ -3,8 +3,8 @@ from typing import Any, Callable, Iterator, List, Optional, TypeVar
 
 from dasl_api import (
     CoreV1Api,
-    CoreV1DataSourceAutoloaderSpec,
     DbuiV1Api,
+    ContentV1Api,
     WorkspaceV1Api,
     WorkspaceV1CreateWorkspaceRequest,
     api,
@@ -13,6 +13,7 @@ from pydantic import Field
 
 from dasl_client.auth.auth import (
     Authorization,
+    DatabricksSecretAuth,
     DatabricksTokenAuth,
     ServiceAccountKeyAuth,
 )
@@ -29,6 +30,8 @@ from .types import (
     WorkspaceConfig,
     TransformRequest,
     TransformResponse,
+    DataSourcePresetsList,
+    DataSourcePreset,
 )
 
 
@@ -51,7 +54,7 @@ class Client:
 
         :param auth: Authorization instance for authorizing requests to
             the dasl control plane.
-        :returns Client
+        :returns: Client
         """
         self.auth = auth
 
@@ -83,7 +86,7 @@ class Client:
         :param dasl_host: The URL of the DASL server. This value should
             not generally be specified unless you are testing against
             an alternative environment.
-        :returns Client
+        :returns: Client for the newly created workspace.
         """
         with error_handler():
             if workspace_url is None:
@@ -131,7 +134,7 @@ class Client:
         :param dasl_host: The URL of the DASL server. This value should
             not generally be specified unless you are testing against
             an alternative environment.
-        :returns Client
+        :returns: Client for the existing workspace.
         """
         with error_handler():
             if workspace_url is None:
@@ -195,7 +198,7 @@ class Client:
         :param dasl_host: The URL of the DASL server. This value should
             not generally be specified unless you are testing against
             an alternative environment.
-        :returns Client
+        :returns: Client for the newly created or existing workspace.
         """
         try:
             return Client.new_workspace(
@@ -229,6 +232,9 @@ class Client:
     def _dbui_client(self) -> DbuiV1Api:
         return DbuiV1Api(self.auth.client())
 
+    def _content_client(self) -> ContentV1Api:
+        return ContentV1Api(self.auth.client())
+
     def _workspace(self) -> str:
         return self.auth.workspace()
 
@@ -274,7 +280,7 @@ class Client:
         you will need to repopulate the service_principal_secret correctly
         before passing the result back to put_admin_config.
 
-        :returns AdminConfig
+        :returns: AdminConfig containing the current settings.
         """
         with error_handler():
             return AdminConfig.from_api_obj(
@@ -306,6 +312,8 @@ class Client:
         Retrieve the WorkspaceConfig from the DASL server. The returned
         value can be updated directly and passed to put_config in order
         to make changes.
+
+        :returns: WorkspaceConfig containing the current configuration.
         """
         with error_handler():
             return WorkspaceConfig.from_api_obj(
@@ -348,7 +356,7 @@ class Client:
         in order to make changes.
 
         :param name: The unique name of the DataSource within this workspace
-        :returns DataSource
+        :returns: DataSource
         """
         with error_handler():
             return DataSource.from_api_obj(
@@ -459,7 +467,7 @@ class Client:
         in order to make changes.
 
         :param name: The unique name of the Rule within this workspace
-        :returns Rule
+        :returns: Rule
         """
         with error_handler():
             return Rule.from_api_obj(
@@ -601,7 +609,7 @@ class Client:
         :param value: The observable value
         :param cursor: A cursor to be used when paginating results
         :param limit: A limit of the number of results to return
-        :returns EventsList
+        :returns: EventsList
         """
         with error_handler():
             return Dbui.ObservableEvents.EventsList.from_api_obj(
@@ -614,3 +622,31 @@ class Client:
                 limit=limit,
             )
         )
+
+    def list_presets(self) -> DataSourcePresetsList:
+        """
+        List the Presets in this workspace. This will include any user defined
+        presets if a custom presets path has been configured in the workspace.
+
+        :returns: DataSourcePresetsList
+        """
+        with error_handler():
+            return DataSourcePresetsList.from_api_obj(
+                self._content_client().content_v1_get_preset_data_sources(
+                    self._workspace(),
+                )
+            )
+
+    def get_preset(self, name: str) -> DataSourcePreset:
+        """
+        Get the preset with the argument name from the DASL server. If the preset name
+        begins with 'internal_' it will instead be collected from the user catalog,
+        provided a preset path is set in the workspace config.
+
+        :param name: The unique name of the DataSource preset within this workspace.
+        :returns: DataSourcePreset
+        """
+        with error_handler():
+            return DataSourcePreset.from_api_obj(
+                self._content_client().content_v1_get_preset_datasource(self._workspace(), name)
+            )
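The two new methods slot into the existing Client surface. A hedged usage sketch, assuming an Authorization is already constructed (the DatabricksTokenAuth arguments and the preset name below are placeholders, not the documented signature):

```python
from dasl_client.client import Client
from dasl_client.auth.auth import DatabricksTokenAuth

auth = DatabricksTokenAuth(...)  # placeholder: construct per your deployment
client = Client(auth)

presets = client.list_presets()        # DataSourcePresetsList
preset = client.get_preset("example")  # hypothetical preset name
```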
dasl_client/preset_development/errors.py
ADDED
@@ -0,0 +1,159 @@
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import *
+from pyspark.sql.dataframe import DataFrame
+from typing import Dict, Any, List, Mapping, Tuple
+from IPython import get_ipython
+
+
+class PresetError(Exception):
+    pass
+
+
+class InvalidGoldTableSchemaError(PresetError):
+    def __init__(self, schema: str, additional_message: str = ""):
+        self.schema = schema
+        message = (
+            f"Malformed gold schema provided {schema}. {additional_message}".strip()
+        )
+        super().__init__(message)
+
+
+class NoSilverStageProvdedError(PresetError):
+    def __init__(self, additional_msg: str = ""):
+        message = f"No silver stage provided{additional_msg}."
+        super().__init__(message)
+
+
+class NoSilverTransformStageProvdedError(PresetError):
+    def __init__(
+        self,
+        message: str = "No silver transform stage provided, but gold stage is present.",
+    ):
+        super().__init__(message)
+
+
+class PreTransformNotFound(PresetError):
+    def __init__(
+        self,
+        message: str = "Requested silver pretransform name not found in preset's silver pretransforms.",
+    ):
+        super().__init__(message)
+
+
+class NoSilverPreTransformStageProvdedError(PresetError):
+    def __init__(
+        self,
+        message: str = "No silver transform stage provided, but prestransform name provided.",
+    ):
+        super().__init__(message)
+
+
+class MissingTableFieldError(PresetError):
+    def __init__(self, layer: str, table_name: str, field_name: str):
+        self.layer = layer
+        self.table_name = table_name
+        self.field_name = field_name
+        message = f"{layer} stage {table_name} is missing {field_name} field."
+        super().__init__(message)
+
+
+class DuplicateFieldNameError(PresetError):
+    def __init__(self, stage: str, stage_name: str, field_name: str):
+        self.stage = stage
+        self.stage_name = stage_name
+        self.field_name = field_name
+        message = f"Duplicate field specification name found in {stage} stage {stage_name} named {field_name}."
+        super().__init__(message)
+
+
+class MalformedFieldError(PresetError):
+    def __init__(self, stage: str, stage_name: str, field_name: str):
+        self.stage = stage
+        self.stage_name = stage_name
+        self.field_name = field_name
+        message = f"Please provide 1 operation only in {stage} stage {stage_name}'s field specification named {field_name}."
+        super().__init__(message)
+
+
+class MissingFieldNameError(PresetError):
+    def __init__(self, stage: str, stage_name: str):
+        self.stage = stage
+        self.stage_name = stage_name
+        message = (
+            f"Field specification in {stage} stage {stage_name} missing name field."
+        )
+        super().__init__(message)
+
+
+class MissingSilverKeysError(PresetError):
+    def __init__(self, missing_keys: str):
+        self.missing_keys = missing_keys
+        message = f"Gold table/s have no corresponding input from silver table/s: {missing_keys}"
+        super().__init__(message)
+
+
+class MissingAutoloaderConfigError(PresetError):
+    def __init__(
+        self,
+        message: str = "Autoloader mode selected, but no autoloader configuration found in preset.autoloader.",
+    ):
+        super().__init__(message)
+
+
+class AutoloaderMissingFieldError(PresetError):
+    def __init__(self, field_name: str):
+        self.field_name = field_name
+        message = f"Autoloader mode selected, but missing field {field_name} in preset."
+        super().__init__(message)
+
+
+class UnknownGoldTableError(PresetError):
+    def __init__(self, table_name: str, schema: str):
+        self.table_name = table_name
+        self.schema = schema
+        message = (
+            f"The referenced Gold table name {table_name} does not exist in {schema}."
+        )
+        super().__init__(message)
+
+
+class GoldTableCompatibilityError(PresetError):
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class ReferencedColumnMissingError(PresetError):
+    def __init__(self, operation: str, column_name: str):
+        self.operation = operation
+        self.column_name = column_name
+        message = f"The referenced column {column_name} was not found in the dataframe during {operation} operation."
+        super().__init__(message)
+
+
+class MissingJoinFieldError(PresetError):
+    def __init__(self, field_name: str):
+        self.field_name = field_name
+        message = f"Join operation is missing required field {field_name}."
+        super().__init__(message)
+
+
+class MissingUtilityConfigurationFieldError(PresetError):
+    def __init__(self, operation: str, field_name: str):
+        self.operation = operation
+        self.field_name = field_name
+        message = f"The required configuration field {field_name} was not suppled in the {operation} operation."
+        super().__init__(message)
+
+
+class AssertionFailedError(PresetError):
+    def __init__(self, expr: str, assertion_message: str, df: DataFrame):
+        # Get the Databricks built-in functions out the namespace.
+        ipython = get_ipython()
+        display = ipython.user_ns["display"]
+
+        self.expr = expr
+        self.assertion_message = assertion_message
+        self.df = df
+        message = f"The above rows failed the assertion expression {expr} with reason: {assertion_message}\n"
+        display(df)
+        super().__init__(message)
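Since every class in errors.py derives from PresetError, callers can handle specific failures first and fall back to the base class. A short sketch (the engine object and schema name are hypothetical):

```python
from dasl_client.preset_development.errors import (
    GoldTableCompatibilityError,
    PresetError,
)

try:
    engine.evaluate("main.gold")  # hypothetical PreviewEngine instance and schema
except GoldTableCompatibilityError as err:
    print(f"gold schema mismatch: {err}")
except PresetError as err:
    # Any other error defined in this module lands here.
    print(f"preset validation failed: {err}")
```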
dasl_client/preset_development/preview_engine.py
ADDED
@@ -0,0 +1,344 @@
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import *
+from pyspark.sql.dataframe import DataFrame
+from pyspark.sql.functions import lit, col as col_, sum as sum_, when
+from dasl_client.preset_development.preview_parameters import *
+from dasl_client.preset_development.stage import *
+from dasl_client.preset_development.errors import *
+import yaml
+from IPython import get_ipython
+
+
+class PreviewEngine:
+    """
+    This class deserializes the in-development preset's YAML and performs a series of
+    validation steps before attempting to compile each stage's table and execute them
+    based on the provided PreviewParameters.
+
+    Upon successful execution, output is generated for each successfully executed
+    stage's table operations. Additionally, if Gold stages are computed, their outputs
+    are validated against the provided Gold stage tables to ensure compatibility on a
+    per-table-name basis with the Unity Catalog.
+
+    For example, a preset Gold stage table named "http_activity" will be checked against
+    the corresponding table in the Unity Catalog schema—also named "http_activity" to
+    confirm that inserting into the Unity Catalog most likely not cause errors.
+    """
+
+    def __init__(
+        self, spark: SparkSession, preset_yaml_str: str, ds_params: PreviewParameters
+    ):
+        """
+        Creates the PreviewEngine using the given preset YAML and datasource parameters.
+        The YAML is deserialized here and checked to verify whether the requested
+        pretransform name, if provided, exists in the preset.
+
+        Instance Attributes:
+            ds_params (PreviewParameters): The input datasource's configuration.
+            preset (Dict[str, Any]): The deserialized preset YAML.
+            pretransform_name (str): The name of the requested pretransform. Defaults to None.
+            pre (Stage): Stores the pretransform Stage object internally.
+            silver (List[Stage]): Stores the Silver Stage objects internally.
+            gold (List[Stage]): Stores the Gold Stage objects internally.
+        """
+        self._spark = spark
+        self._ds_params = ds_params
+        self._preset = yaml.safe_load(preset_yaml_str)
+        self._pretransform_name = ds_params._pretransform_name
+
+        self._validate_gold_inputs(
+            self._preset.get("silver", None), self._preset.get("gold", None)
+        )
+        if self._pretransform_name:
+            self._validate_pretransform_name(
+                self._preset.get("silver", None), self._pretransform_name
+            )
+
+        self._pre = None
+        self._silver = []
+        self._gold = []
+        self._result_df_map = {}
+
+    def _validate_pretransform_name(
+        self, silver: Dict[str, str], pretransform_name: str
+    ) -> None:
+        """
+        Validates the given pretransform name exists in the provided preset's Silver
+        PreTransform stages.
+        """
+        if not silver:
+            raise NoSilverStageProvdedError(", but pretransform name provided")
+        if not (silver_pre_transform := silver.get("preTransform", None)):
+            raise NoSilverPreTransformStageProvdedError()
+        silver_pre_output_names = []
+        for table in silver_pre_transform:
+            if not (name := table.get("name", None)):
+                raise MissingTableFieldError(
+                    "Silver pretransform",
+                    table.get("name", "<stage missing name>"),
+                    "name",
+                )
+            silver_pre_output_names += [name]
+        if pretransform_name not in silver_pre_output_names:
+            raise PreTransformNotFound()
+
+    def _validate_gold_inputs(
+        self, silver: Dict[str, str], gold: Dict[str, str]
+    ) -> None:
+        """
+        Validate gold tables all have a silver table to input from.
+        """
+        if not gold:
+            return
+
+        if not len(gold):
+            return
+
+        if not silver:
+            raise NoSilverStageProvdedError(", but gold stage is present")
+
+        gold_input_names = []
+        for table in gold:
+            if not (input := table.get("input", None)):
+                raise MissingTableFieldError(
+                    "Gold", table.get("name", "<stage missing name>"), "input"
+                )
+            gold_input_names += [input]
+
+        if not (silver_transform := silver.get("transform", None)):
+            raise NoSilverTransformStageProvdedError()
+        silver_output_names = []
+        for table in silver_transform:
+            if not (name := table.get("name", None)):
+                raise MissingTableFieldError(
+                    "Silver transform", table.get("name", ""), "name"
+                )
+            silver_output_names += [name]
+
+        missing_keys = set(gold_input_names) - set(silver_output_names)
+        if missing_keys:
+            raise MissingSilverKeysError(missing_keys)
+
+    def _compile_stages(self) -> None:
+        """
+        Creates Stage objects, setting pretransform to None if not provided.
+        """
+        pretransform = None
+        if self._pretransform_name:
+            for table in self._preset["silver"]["preTransform"]:
+                if table["name"] == self._pretransform_name:
+                    self._pre = Stage(self._spark, "silver pretransform", table)
+                    break
+
+        self._silver = [
+            Stage(self._spark, "silver transform", table)
+            for table in self._preset.get("silver", {}).get("transform", [])
+        ]
+        self._gold = [
+            Stage(self._spark, "gold", table) for table in self._preset.get("gold", [])
+        ]
+
+    def _run(
+        self, df: DataFrame
+    ) -> Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]]:
+        """
+        Runs all stages, in medallion stage order. This allows prior stage outputs to feed
+        into later stage inputs.
+
+        Returns:
+            Dataframes containing the output from each run Stage.
+        """
+        if self._pre:
+            df = self._pre.run(df)
+
+        silver_output_map = {}
+        for table in self._silver:
+            silver_output_map[table._name] = table.run(df)
+
+        gold_output_map = {}
+        for table in self._gold:
+            # We store as gold_name/silver_input to prevent clobbering on duplicate gold table use.
+            gold_output_map[f"{table._name}/{table._input}"] = table.run(
+                silver_output_map[table._input]
+            )
+
+        return (
+            (df, silver_output_map, gold_output_map)
+            if self._pre
+            else (None, silver_output_map, gold_output_map)
+        )
+
+    def _render_output(
+        self,
+        input_df: DataFrame,
+        stage_dataframes: Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]],
+        gold_table_schema: str,
+    ) -> None:
+        """
+        Displays formatted HTML output from executed Stages' DataFrames.
+        """
+        # TODO: Investigate further into using Databricks's style sheets here.
+
+        # Get the Databricks built-in functions out the namespace.
+        ipython = get_ipython()
+        displayHTML = ipython.user_ns["displayHTML"]
+        display = ipython.user_ns["display"]
+
+        def d(txt, lvl) -> None:
+            displayHTML(
+                f"""
+                <div style="background-color:
+                background-color: rgb(18, 23, 26); padding: 0; margin: 0;">
+                <h{lvl} style="margin: 0; background-color: rgb(244, 234, 229);">{txt}</h{lvl}>
+                </div>
+                """
+            )
+
+        def check_struct_compatibility(
+            target_field: StructField, df_field: StructField, prefix=""
+        ):
+            if not (
+                isinstance(target_field.dataType, StructType)
+                and isinstance(df_field.dataType, StructType)
+            ):
+                return
+
+            target_fields = {
+                field.name: field for field in target_field.dataType.fields
+            }
+            for field in df_field.dataType.fields:
+                if field.name not in target_fields:
+                    raise GoldTableCompatibilityError(
+                        f"Extra field found in gold stage output STRUCT column {prefix}{target_field.name}: {field.name}"
+                    )
+                else:
+                    if isinstance(field.dataType, StructType):
+                        check_struct_compatibility(
+                            target_fields[field.name],
+                            field,
+                            prefix=prefix + target_field.name + ".",
+                        )
+
+        (pre_df, silver, gold) = stage_dataframes
+        d("Input", 1)
+        display(input_df)
+        d("Silver Pre-Transform", 1)
+        if pre_df:
+            display(pre_df)
+        else:
+            d("Skipped", 2)
+        d("Silver Transform", 1)
+        for name, df in silver.items():
+            d(f"{name}", 2)
+            display(df)
+        d("Gold", 1)
+        for name, df in gold.items():
+            d(f"{name}", 2)
+            d("Stage output", 3)
+            display(df)
+
+            # NOTE: Name is stored as Gold_name/Silver_input. So we need to get just the Gold table
+            # name that we are comparing the dataframe metadata to.
+            name = name.split("/")[0]
+
+            if not self._spark.catalog.tableExists(f"{gold_table_schema}.{name}"):
+                raise UnknownGoldTableError(name, gold_table_schema)
+
+            # Performs the type check.
+            delta_df = self._spark.table(f"{gold_table_schema}.{name}").limit(0)
+            unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
+
+            # Now we check no new columns.
+            if not set(df.columns).issubset(delta_df.columns):
+                raise GoldTableCompatibilityError(
+                    f"Extra columns provided: {', '.join([col for col in df.columns if col not in delta_df.columns])}"
+                )
+
+            # Now we check no new fields in STRUCT columns.
+            for field in delta_df.schema.fields:
+                if isinstance(field.dataType, StructType) and field.name in df.columns:
+                    # Retrieve the corresponding field from the DataFrame's schema.
+                    df_field = next(f for f in df.schema.fields if f.name == field.name)
+                    check_struct_compatibility(field, df_field)
+
+            # Check nullable columns exist, and data what we are inserting is set.
+            non_nullable_cols = [
+                field.name for field in delta_df.schema.fields if not field.nullable
+            ]
+            null_checks = [
+                sum_(when(col_(col).isNull(), 1).otherwise(0)).alias(col)
+                for col in non_nullable_cols
+            ]
+            null_counts = df.select(null_checks).collect()[0].asDict()
+            cols_with_nulls = []
+            try:
+                cols_with_nulls = [
+                    col_name for col_name, count in null_counts.items() if count > 0
+                ]
+            except TypeError:
+                # There were no records returned and so null_counts == None.
+                pass
+            if cols_with_nulls:
+                raise GoldTableCompatibilityError(
+                    f"Record with null data found for non-nullable columns: {', '.join([col for col in cols_with_nulls])}"
+                )
+
+            d("Resultant gold table preview", 3)
+            display(unioned_df)
+
+    def evaluate(self, gold_table_schema: str) -> None:
+        """
+        Evaluates the loaded preset YAML using the input datasource configuration to load
+        records. Finally, checks that the output from the Gold stages is compatible with
+        the Unity Catalog Gold tables.
+        """
+        s = gold_table_schema.split(".")
+        if len(s) != 2:
+            raise InvalidGoldTableSchemaError(gold_table_schema)
+        catalog_name = s[0]
+        schema_name = s[1]
+        if any(
+            row.catalog == catalog_name
+            for row in self._spark.sql("SHOW CATALOGS").collect()
+        ):
+            if not any(
+                row.databaseName == schema_name
+                for row in self._spark.sql(f"SHOW SCHEMAS IN {catalog_name}").collect()
+            ):
+                raise InvalidGoldTableSchemaError(
+                    gold_table_schema,
+                    f"Schema {schema_name} not found in catalog {catalog_name} or insufficient permissions.",
+                )
+        else:
+            raise InvalidGoldTableSchemaError(
+                gold_table_schema,
+                f"Catalog {catalog_name} not found or insufficient permissions.",
+            )
+
+        # If we are using the autoloader, fetch format from preset and others.
+        if self._ds_params._mode == "autoloader":
+            if not (autoloader_conf := self._preset.get("autoloader", None)):
+                raise MissingAutoloaderConfigError()
+            if not (file_format := autoloader_conf.get("format", None)):
+                raise AutoloaderMissingFieldError("format")
+            self._ds_params.set_autoloader_format(file_format)
+            if schemaFile := autoloader_conf.get("schemaFile", None):
+                self._ds_params.set_autoloader_schema_file(schemaFile)
+            if multiline := autoloader_conf.get("multiline", None):
+                if multiline == "true":
+                    self._ds_params.set_multiline(True)
+                else:
+                    self._ds_params.set_multiline(False)
+            if cloudFiles := autoloader_conf.get("cloudFiles", None):
+                if schema_hints := cloudFiles.get("schemaHints", None):
+                    self._ds_params.set_autoloader_cloudfiles_schema_hints(schema_hints)
+                if schema_hints_file := cloudFiles.get("schemaHintsFile", None):
+                    self._ds_params.set_autoloader_cloudfiles_schema_hint_file(
+                        schema_hints_file
+                    )
+
+        self._compile_stages()
+
+        with self._ds_params as df:
+            self._result_df_map = self._run(df)
+            self._render_output(df, self._result_df_map, gold_table_schema)
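Taken together, a notebook session would drive the engine roughly as follows. This is a sketch, assuming a Databricks notebook where `spark` is already in scope; the preset file name, the PreviewParameters construction (defined in preview_parameters.py, not shown in this diff), and the catalog.schema value are all placeholders:

```python
from dasl_client.preset_development.preview_engine import PreviewEngine
from dasl_client.preset_development.preview_parameters import PreviewParameters

with open("my_preset.yaml") as f:      # hypothetical preset file
    preset_yaml = f.read()

params = PreviewParameters(...)        # placeholder: construction elided
engine = PreviewEngine(spark, preset_yaml, params)
engine.evaluate("main.security_gold")  # hypothetical <catalog>.<schema>
```

Worth noting in `_render_output`: `self._spark.table(...).limit(0)` pulls only the Gold table's schema, and `unionByName(df, allowMissingColumns=True)` then fails if the stage output's column types cannot be reconciled, which amounts to a type check without writing a single row.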