dasl-client 1.0.7.tar.gz → 1.0.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dasl-client might be problematic.

Files changed (37)
  1. {dasl_client-1.0.7 → dasl_client-1.0.9}/PKG-INFO +2 -1
  2. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/client.py +10 -8
  3. dasl_client-1.0.9/dasl_client/preset_development/__init__.py +4 -0
  4. dasl_client-1.0.9/dasl_client/preset_development/errors.py +159 -0
  5. dasl_client-1.0.9/dasl_client/preset_development/preview_engine.py +344 -0
  6. dasl_client-1.0.9/dasl_client/preset_development/preview_parameters.py +386 -0
  7. dasl_client-1.0.9/dasl_client/preset_development/stage.py +559 -0
  8. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/admin_config.py +10 -7
  9. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/datasource.py +177 -138
  10. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/dbui.py +46 -34
  11. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/rule.py +91 -65
  12. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/types.py +67 -54
  13. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/workspace_config.py +86 -74
  14. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client.egg-info/PKG-INFO +2 -1
  15. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client.egg-info/SOURCES.txt +5 -0
  16. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client.egg-info/requires.txt +1 -0
  17. {dasl_client-1.0.7 → dasl_client-1.0.9}/pyproject.toml +3 -2
  18. {dasl_client-1.0.7 → dasl_client-1.0.9}/LICENSE +0 -0
  19. {dasl_client-1.0.7 → dasl_client-1.0.9}/README.md +0 -0
  20. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/__init__.py +0 -0
  21. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/auth/__init__.py +0 -0
  22. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/auth/auth.py +0 -0
  23. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/conn/__init__.py +0 -0
  24. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/conn/client_identifier.py +0 -0
  25. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/conn/conn.py +0 -0
  26. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/errors/__init__.py +0 -0
  27. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/errors/errors.py +0 -0
  28. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/helpers.py +0 -0
  29. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/__init__.py +0 -0
  30. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/types/helpers.py +0 -0
  31. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client.egg-info/dependency_links.txt +0 -0
  32. {dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client.egg-info/top_level.txt +0 -0
  33. {dasl_client-1.0.7 → dasl_client-1.0.9}/setup.cfg +0 -0
  34. {dasl_client-1.0.7 → dasl_client-1.0.9}/setup.py +0 -0
  35. {dasl_client-1.0.7 → dasl_client-1.0.9}/test/test_api_changes.py +0 -0
  36. {dasl_client-1.0.7 → dasl_client-1.0.9}/test/test_api_surface.py +0 -0
  37. {dasl_client-1.0.7 → dasl_client-1.0.9}/test/test_marshaling.py +0 -0

{dasl_client-1.0.7 → dasl_client-1.0.9}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dasl_client
- Version: 1.0.7
+ Version: 1.0.9
  Summary: The DASL client library used for interacting with the DASL workspace
  Home-page: https://github.com/antimatter/asl
  Author: Antimatter Team
@@ -11,6 +11,7 @@ License-File: LICENSE
  Requires-Dist: dasl_api==0.1.12
  Requires-Dist: databricks-sdk>=0.41.0
  Requires-Dist: pydantic>=2
+ Requires-Dist: typing_extensions==4.10.0

  # DASL Client Library


{dasl_client-1.0.7 → dasl_client-1.0.9}/dasl_client/client.py
@@ -51,7 +51,7 @@ class Client:

          :param auth: Authorization instance for authorizing requests to
              the dasl control plane.
-         :returns Client
+         :returns: Client
          """
          self.auth = auth

@@ -83,7 +83,7 @@ class Client:
          :param dasl_host: The URL of the DASL server. This value should
              not generally be specified unless you are testing against
              an alternative environment.
-         :returns Client
+         :returns: Client for the newly created workspace.
          """
          with error_handler():
              if workspace_url is None:
@@ -131,7 +131,7 @@ class Client:
          :param dasl_host: The URL of the DASL server. This value should
              not generally be specified unless you are testing against
              an alternative environment.
-         :returns Client
+         :returns: Client for the existing workspace.
          """
          with error_handler():
              if workspace_url is None:
@@ -195,7 +195,7 @@ class Client:
          :param dasl_host: The URL of the DASL server. This value should
              not generally be specified unless you are testing against
              an alternative environment.
-         :returns Client
+         :returns: Client for the newly created or existing workspace.
          """
          try:
              return Client.new_workspace(
@@ -274,7 +274,7 @@ class Client:
          you will need to repopulate the service_principal_secret correctly
          before passing the result back to put_admin_config.

-         :returns AdminConfig
+         :returns: AdminConfig containing the current settings.
          """
          with error_handler():
              return AdminConfig.from_api_obj(
@@ -306,6 +306,8 @@ class Client:
          Retrieve the WorkspaceConfig from the DASL server. The returned
          value can be updated directly and passed to put_config in order
          to make changes.
+
+         :returns: WorkspaceConfig containing the current configuration.
          """
          with error_handler():
              return WorkspaceConfig.from_api_obj(
@@ -348,7 +350,7 @@ class Client:
          in order to make changes.

          :param name: The unique name of the DataSource within this workspace
-         :returns DataSource
+         :returns: DataSource
          """
          with error_handler():
              return DataSource.from_api_obj(
@@ -459,7 +461,7 @@ class Client:
          in order to make changes.

          :param name: The unique name of the Rule within this workspace
-         :returns Rule
+         :returns: Rule
          """
          with error_handler():
              return Rule.from_api_obj(
@@ -601,7 +603,7 @@ class Client:
          :param value: The observable value
          :param cursor: A cursor to be used when paginating results
          :param limit: A limit of the number of results to return
-         :returns EventsList
+         :returns: EventsList
          """
          with error_handler():
              return Dbui.ObservableEvents.EventsList.from_api_obj(
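
Note: the client.py changes above are pure docstring fixes. In reST/Sphinx field lists the field name needs a closing colon (":returns:", not ":returns") for documentation tooling and IDE tooltips to render the return value. A minimal illustration of the convention (the function below is hypothetical, not part of the package):

    def get_datasource(name: str) -> "DataSource":
        """
        Fetch a DataSource by name.

        :param name: The unique name of the DataSource within this workspace
        :returns: DataSource
        """
        ...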

dasl_client-1.0.9/dasl_client/preset_development/__init__.py
@@ -0,0 +1,4 @@
+ # dasl_client/preset_development/__init__.py
+ from .preview_parameters import *
+ from .errors import *
+ from .preview_engine import *
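
Because __init__.py re-exports each submodule with wildcard imports, the public names should be importable directly from the subpackage root. A minimal sketch, assuming the package (and its pyspark/IPython dependencies) is installed:

    # Hypothetical usage: pull the re-exported classes from the subpackage root.
    from dasl_client.preset_development import PreviewEngine, PresetError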

dasl_client-1.0.9/dasl_client/preset_development/errors.py
@@ -0,0 +1,159 @@
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import *
+ from pyspark.sql.dataframe import DataFrame
+ from typing import Dict, Any, List, Mapping, Tuple
+ from IPython import get_ipython
+
+
+ class PresetError(Exception):
+     pass
+
+
+ class InvalidGoldTableSchemaError(PresetError):
+     def __init__(self, schema: str, additional_message: str = ""):
+         self.schema = schema
+         message = (
+             f"Malformed gold schema provided {schema}. {additional_message}".strip()
+         )
+         super().__init__(message)
+
+
+ class NoSilverStageProvdedError(PresetError):
+     def __init__(self, additional_msg: str = ""):
+         message = f"No silver stage provided{additional_msg}."
+         super().__init__(message)
+
+
+ class NoSilverTransformStageProvdedError(PresetError):
+     def __init__(
+         self,
+         message: str = "No silver transform stage provided, but gold stage is present.",
+     ):
+         super().__init__(message)
+
+
+ class PreTransformNotFound(PresetError):
+     def __init__(
+         self,
+         message: str = "Requested silver pretransform name not found in preset's silver pretransforms.",
+     ):
+         super().__init__(message)
+
+
+ class NoSilverPreTransformStageProvdedError(PresetError):
+     def __init__(
+         self,
+         message: str = "No silver transform stage provided, but prestransform name provided.",
+     ):
+         super().__init__(message)
+
+
+ class MissingTableFieldError(PresetError):
+     def __init__(self, layer: str, table_name: str, field_name: str):
+         self.layer = layer
+         self.table_name = table_name
+         self.field_name = field_name
+         message = f"{layer} stage {table_name} is missing {field_name} field."
+         super().__init__(message)
+
+
+ class DuplicateFieldNameError(PresetError):
+     def __init__(self, stage: str, stage_name: str, field_name: str):
+         self.stage = stage
+         self.stage_name = stage_name
+         self.field_name = field_name
+         message = f"Duplicate field specification name found in {stage} stage {stage_name} named {field_name}."
+         super().__init__(message)
+
+
+ class MalformedFieldError(PresetError):
+     def __init__(self, stage: str, stage_name: str, field_name: str):
+         self.stage = stage
+         self.stage_name = stage_name
+         self.field_name = field_name
+         message = f"Please provide 1 operation only in {stage} stage {stage_name}'s field specification named {field_name}."
+         super().__init__(message)
+
+
+ class MissingFieldNameError(PresetError):
+     def __init__(self, stage: str, stage_name: str):
+         self.stage = stage
+         self.stage_name = stage_name
+         message = (
+             f"Field specification in {stage} stage {stage_name} missing name field."
+         )
+         super().__init__(message)
+
+
+ class MissingSilverKeysError(PresetError):
+     def __init__(self, missing_keys: str):
+         self.missing_keys = missing_keys
+         message = f"Gold table/s have no corresponding input from silver table/s: {missing_keys}"
+         super().__init__(message)
+
+
+ class MissingAutoloaderConfigError(PresetError):
+     def __init__(
+         self,
+         message: str = "Autoloader mode selected, but no autoloader configuration found in preset.autoloader.",
+     ):
+         super().__init__(message)
+
+
+ class AutoloaderMissingFieldError(PresetError):
+     def __init__(self, field_name: str):
+         self.field_name = field_name
+         message = f"Autoloader mode selected, but missing field {field_name} in preset."
+         super().__init__(message)
+
+
+ class UnknownGoldTableError(PresetError):
+     def __init__(self, table_name: str, schema: str):
+         self.table_name = table_name
+         self.schema = schema
+         message = (
+             f"The referenced Gold table name {table_name} does not exist in {schema}."
+         )
+         super().__init__(message)
+
+
+ class GoldTableCompatibilityError(PresetError):
+     def __init__(self, message: str):
+         super().__init__(message)
+
+
+ class ReferencedColumnMissingError(PresetError):
+     def __init__(self, operation: str, column_name: str):
+         self.operation = operation
+         self.column_name = column_name
+         message = f"The referenced column {column_name} was not found in the dataframe during {operation} operation."
+         super().__init__(message)
+
+
+ class MissingJoinFieldError(PresetError):
+     def __init__(self, field_name: str):
+         self.field_name = field_name
+         message = f"Join operation is missing required field {field_name}."
+         super().__init__(message)
+
+
+ class MissingUtilityConfigurationFieldError(PresetError):
+     def __init__(self, operation: str, field_name: str):
+         self.operation = operation
+         self.field_name = field_name
+         message = f"The required configuration field {field_name} was not suppled in the {operation} operation."
+         super().__init__(message)
+
+
+ class AssertionFailedError(PresetError):
+     def __init__(self, expr: str, assertion_message: str, df: DataFrame):
+         # Get the Databricks built-in functions out of the namespace.
+         ipython = get_ipython()
+         display = ipython.user_ns["display"]
+
+         self.expr = expr
+         self.assertion_message = assertion_message
+         self.df = df
+         message = f"The above rows failed the assertion expression {expr} with reason: {assertion_message}\n"
+         display(df)
+         super().__init__(message)
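
All of the exceptions above derive from PresetError, so callers can handle any preset validation failure with a single except clause. A minimal sketch (the error arguments are illustrative only):

    from dasl_client.preset_development.errors import (
        MissingTableFieldError,
        PresetError,
    )

    try:
        # Simulate a validation failure for a gold table missing its "input" field.
        raise MissingTableFieldError("Gold", "http_activity", "input")
    except PresetError as e:
        print(f"Preset validation failed: {e}")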

dasl_client-1.0.9/dasl_client/preset_development/preview_engine.py
@@ -0,0 +1,344 @@
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import *
+ from pyspark.sql.dataframe import DataFrame
+ from pyspark.sql.functions import lit, col as col_, sum as sum_, when
+ from dasl_client.preset_development.preview_parameters import *
+ from dasl_client.preset_development.stage import *
+ from dasl_client.preset_development.errors import *
+ import yaml
+ from IPython import get_ipython
+
+
+ class PreviewEngine:
+     """
+     This class deserializes the in-development preset's YAML and performs a series of
+     validation steps before attempting to compile each stage's table and execute them
+     based on the provided PreviewParameters.
+
+     Upon successful execution, output is generated for each successfully executed
+     stage's table operations. Additionally, if Gold stages are computed, their outputs
+     are validated against the provided Gold stage tables to ensure compatibility on a
+     per-table-name basis with the Unity Catalog.
+
+     For example, a preset Gold stage table named "http_activity" will be checked against
+     the corresponding table in the Unity Catalog schema (also named "http_activity") to
+     confirm that inserting into the Unity Catalog will most likely not cause errors.
+     """
+
+     def __init__(
+         self, spark: SparkSession, preset_yaml_str: str, ds_params: PreviewParameters
+     ):
+         """
+         Creates the PreviewEngine using the given preset YAML and datasource parameters.
+         The YAML is deserialized here and checked to verify whether the requested
+         pretransform name, if provided, exists in the preset.
+
+         Instance Attributes:
+             ds_params (PreviewParameters): The input datasource's configuration.
+             preset (Dict[str, Any]): The deserialized preset YAML.
+             pretransform_name (str): The name of the requested pretransform. Defaults to None.
+             pre (Stage): Stores the pretransform Stage object internally.
+             silver (List[Stage]): Stores the Silver Stage objects internally.
+             gold (List[Stage]): Stores the Gold Stage objects internally.
+         """
+         self._spark = spark
+         self._ds_params = ds_params
+         self._preset = yaml.safe_load(preset_yaml_str)
+         self._pretransform_name = ds_params._pretransform_name
+
+         self._validate_gold_inputs(
+             self._preset.get("silver", None), self._preset.get("gold", None)
+         )
+         if self._pretransform_name:
+             self._validate_pretransform_name(
+                 self._preset.get("silver", None), self._pretransform_name
+             )
+
+         self._pre = None
+         self._silver = []
+         self._gold = []
+         self._result_df_map = {}
+
+     def _validate_pretransform_name(
+         self, silver: Dict[str, str], pretransform_name: str
+     ) -> None:
+         """
+         Validates the given pretransform name exists in the provided preset's Silver
+         PreTransform stages.
+         """
+         if not silver:
+             raise NoSilverStageProvdedError(", but pretransform name provided")
+         if not (silver_pre_transform := silver.get("preTransform", None)):
+             raise NoSilverPreTransformStageProvdedError()
+         silver_pre_output_names = []
+         for table in silver_pre_transform:
+             if not (name := table.get("name", None)):
+                 raise MissingTableFieldError(
+                     "Silver pretransform",
+                     table.get("name", "<stage missing name>"),
+                     "name",
+                 )
+             silver_pre_output_names += [name]
+         if pretransform_name not in silver_pre_output_names:
+             raise PreTransformNotFound()
+
+     def _validate_gold_inputs(
+         self, silver: Dict[str, str], gold: Dict[str, str]
+     ) -> None:
+         """
+         Validate gold tables all have a silver table to input from.
+         """
+         if not gold:
+             return
+
+         if not len(gold):
+             return
+
+         if not silver:
+             raise NoSilverStageProvdedError(", but gold stage is present")
+
+         gold_input_names = []
+         for table in gold:
+             if not (input := table.get("input", None)):
+                 raise MissingTableFieldError(
+                     "Gold", table.get("name", "<stage missing name>"), "input"
+                 )
+             gold_input_names += [input]
+
+         if not (silver_transform := silver.get("transform", None)):
+             raise NoSilverTransformStageProvdedError()
+         silver_output_names = []
+         for table in silver_transform:
+             if not (name := table.get("name", None)):
+                 raise MissingTableFieldError(
+                     "Silver transform", table.get("name", ""), "name"
+                 )
+             silver_output_names += [name]
+
+         missing_keys = set(gold_input_names) - set(silver_output_names)
+         if missing_keys:
+             raise MissingSilverKeysError(missing_keys)
+
+     def _compile_stages(self) -> None:
+         """
+         Creates Stage objects, setting pretransform to None if not provided.
+         """
+         pretransform = None
+         if self._pretransform_name:
+             for table in self._preset["silver"]["preTransform"]:
+                 if table["name"] == self._pretransform_name:
+                     self._pre = Stage(self._spark, "silver pretransform", table)
+                     break
+
+         self._silver = [
+             Stage(self._spark, "silver transform", table)
+             for table in self._preset.get("silver", {}).get("transform", [])
+         ]
+         self._gold = [
+             Stage(self._spark, "gold", table) for table in self._preset.get("gold", [])
+         ]
+
+     def _run(
+         self, df: DataFrame
+     ) -> Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]]:
+         """
+         Runs all stages, in medallion stage order. This allows prior stage outputs to feed
+         into later stage inputs.
+
+         Returns:
+             Dataframes containing the output from each Stage that was run.
+         """
+         if self._pre:
+             df = self._pre.run(df)
+
+         silver_output_map = {}
+         for table in self._silver:
+             silver_output_map[table._name] = table.run(df)
+
+         gold_output_map = {}
+         for table in self._gold:
+             # We store as gold_name/silver_input to prevent clobbering on duplicate gold table use.
+             gold_output_map[f"{table._name}/{table._input}"] = table.run(
+                 silver_output_map[table._input]
+             )
+
+         return (
+             (df, silver_output_map, gold_output_map)
+             if self._pre
+             else (None, silver_output_map, gold_output_map)
+         )
+
+     def _render_output(
+         self,
+         input_df: DataFrame,
+         stage_dataframes: Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]],
+         gold_table_schema: str,
+     ) -> None:
+         """
+         Displays formatted HTML output from executed Stages' DataFrames.
+         """
+         # TODO: Investigate further into using Databricks's style sheets here.
+
+         # Get the Databricks built-in functions out of the namespace.
+         ipython = get_ipython()
+         displayHTML = ipython.user_ns["displayHTML"]
+         display = ipython.user_ns["display"]
+
+         def d(txt, lvl) -> None:
+             displayHTML(
+                 f"""
+                 <div style="background-color:
+                 background-color: rgb(18, 23, 26); padding: 0; margin: 0;">
+                 <h{lvl} style="margin: 0; background-color: rgb(244, 234, 229);">{txt}</h{lvl}>
+                 </div>
+                 """
+             )
+
+         def check_struct_compatibility(
+             target_field: StructField, df_field: StructField, prefix=""
+         ):
+             if not (
+                 isinstance(target_field.dataType, StructType)
+                 and isinstance(df_field.dataType, StructType)
+             ):
+                 return
+
+             target_fields = {
+                 field.name: field for field in target_field.dataType.fields
+             }
+             for field in df_field.dataType.fields:
+                 if field.name not in target_fields:
+                     raise GoldTableCompatibilityError(
+                         f"Extra field found in gold stage output STRUCT column {prefix}{target_field.name}: {field.name}"
+                     )
+                 else:
+                     if isinstance(field.dataType, StructType):
+                         check_struct_compatibility(
+                             target_fields[field.name],
+                             field,
+                             prefix=prefix + target_field.name + ".",
+                         )
+
+         (pre_df, silver, gold) = stage_dataframes
+         d("Input", 1)
+         display(input_df)
+         d("Silver Pre-Transform", 1)
+         if pre_df:
+             display(pre_df)
+         else:
+             d("Skipped", 2)
+         d("Silver Transform", 1)
+         for name, df in silver.items():
+             d(f"{name}", 2)
+             display(df)
+         d("Gold", 1)
+         for name, df in gold.items():
+             d(f"{name}", 2)
+             d("Stage output", 3)
+             display(df)
+
+             # NOTE: Name is stored as Gold_name/Silver_input. So we need to get just the Gold table
+             # name that we are comparing the dataframe metadata to.
+             name = name.split("/")[0]
+
+             if not self._spark.catalog.tableExists(f"{gold_table_schema}.{name}"):
+                 raise UnknownGoldTableError(name, gold_table_schema)
+
+             # Performs the type check.
+             delta_df = self._spark.table(f"{gold_table_schema}.{name}").limit(0)
+             unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
+
+             # Now we check no new columns.
+             if not set(df.columns).issubset(delta_df.columns):
+                 raise GoldTableCompatibilityError(
+                     f"Extra columns provided: {', '.join([col for col in df.columns if col not in delta_df.columns])}"
+                 )
+
+             # Now we check no new fields in STRUCT columns.
+             for field in delta_df.schema.fields:
+                 if isinstance(field.dataType, StructType) and field.name in df.columns:
+                     # Retrieve the corresponding field from the DataFrame's schema.
+                     df_field = next(f for f in df.schema.fields if f.name == field.name)
+                     check_struct_compatibility(field, df_field)
+
+             # Check nullable columns exist, and that the data we are inserting is set.
+             non_nullable_cols = [
+                 field.name for field in delta_df.schema.fields if not field.nullable
+             ]
+             null_checks = [
+                 sum_(when(col_(col).isNull(), 1).otherwise(0)).alias(col)
+                 for col in non_nullable_cols
+             ]
+             null_counts = df.select(null_checks).collect()[0].asDict()
+             cols_with_nulls = []
+             try:
+                 cols_with_nulls = [
+                     col_name for col_name, count in null_counts.items() if count > 0
+                 ]
+             except TypeError:
+                 # There were no records returned and so null_counts == None.
+                 pass
+             if cols_with_nulls:
+                 raise GoldTableCompatibilityError(
+                     f"Record with null data found for non-nullable columns: {', '.join([col for col in cols_with_nulls])}"
+                 )
+
+             d("Resultant gold table preview", 3)
+             display(unioned_df)
+
+     def evaluate(self, gold_table_schema: str) -> None:
+         """
+         Evaluates the loaded preset YAML using the input datasource configuration to load
+         records. Finally, checks that the output from the Gold stages is compatible with
+         the Unity Catalog Gold tables.
+         """
+         s = gold_table_schema.split(".")
+         if len(s) != 2:
+             raise InvalidGoldTableSchemaError(gold_table_schema)
+         catalog_name = s[0]
+         schema_name = s[1]
+         if any(
+             row.catalog == catalog_name
+             for row in self._spark.sql("SHOW CATALOGS").collect()
+         ):
+             if not any(
+                 row.databaseName == schema_name
+                 for row in self._spark.sql(f"SHOW SCHEMAS IN {catalog_name}").collect()
+             ):
+                 raise InvalidGoldTableSchemaError(
+                     gold_table_schema,
+                     f"Schema {schema_name} not found in catalog {catalog_name} or insufficient permissions.",
+                 )
+         else:
+             raise InvalidGoldTableSchemaError(
+                 gold_table_schema,
+                 f"Catalog {catalog_name} not found or insufficient permissions.",
+             )
+
+         # If we are using the autoloader, fetch the format and other settings from the preset.
+         if self._ds_params._mode == "autoloader":
+             if not (autoloader_conf := self._preset.get("autoloader", None)):
+                 raise MissingAutoloaderConfigError()
+             if not (file_format := autoloader_conf.get("format", None)):
+                 raise AutoloaderMissingFieldError("format")
+             self._ds_params.set_autoloader_format(file_format)
+             if schemaFile := autoloader_conf.get("schemaFile", None):
+                 self._ds_params.set_autoloader_schema_file(schemaFile)
+             if multiline := autoloader_conf.get("multiline", None):
+                 if multiline == "true":
+                     self._ds_params.set_multiline(True)
+                 else:
+                     self._ds_params.set_multiline(False)
+             if cloudFiles := autoloader_conf.get("cloudFiles", None):
+                 if schema_hints := cloudFiles.get("schemaHints", None):
+                     self._ds_params.set_autoloader_cloudfiles_schema_hints(schema_hints)
+                 if schema_hints_file := cloudFiles.get("schemaHintsFile", None):
+                     self._ds_params.set_autoloader_cloudfiles_schema_hint_file(
+                         schema_hints_file
+                     )
+
+         self._compile_stages()
+
+         with self._ds_params as df:
+             self._result_df_map = self._run(df)
+             self._render_output(df, self._result_df_map, gold_table_schema)
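
Taken together, a preview run likely looks like the sketch below in a Databricks notebook. PreviewParameters is added in this release but its constructor is not reproduced in this diff, so its arguments are elided; the preset path and schema name are hypothetical, and evaluate() requires the "<catalog>.<schema>" form:

    from dasl_client.preset_development import PreviewEngine, PreviewParameters

    # Hypothetical notebook usage; `spark` is the session Databricks provides.
    preset_yaml_str = open("/Workspace/presets/my_preset.yaml").read()
    params = PreviewParameters(...)  # arguments elided; see preview_parameters.py
    engine = PreviewEngine(spark, preset_yaml_str, params)
    engine.evaluate("main.security_gold")  # must be "<catalog>.<schema>"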