dasl-client 1.0.22__py3-none-any.whl → 1.0.24__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dasl-client might be problematic. Click here for more details.
- dasl_client/__init__.py +1 -0
- dasl_client/client.py +240 -23
- dasl_client/exec_rule.py +92 -0
- dasl_client/helpers.py +1 -1
- dasl_client/preset_development/errors.py +42 -0
- dasl_client/preset_development/preview_engine.py +122 -61
- dasl_client/preset_development/preview_parameters.py +237 -97
- dasl_client/preset_development/stage.py +87 -24
- dasl_client/regions.json +3 -0
- dasl_client/regions.py +18 -0
- dasl_client/types/datasource.py +51 -0
- dasl_client/types/rule.py +33 -0
- dasl_client/types/workspace_config.py +121 -9
- dasl_client-1.0.24.dist-info/METADATA +18 -0
- dasl_client-1.0.24.dist-info/RECORD +32 -0
- {dasl_client-1.0.22.dist-info → dasl_client-1.0.24.dist-info}/WHEEL +1 -1
- {dasl_client-1.0.22.dist-info → dasl_client-1.0.24.dist-info}/top_level.txt +0 -1
- dasl_client-1.0.22.dist-info/METADATA +0 -34
- dasl_client-1.0.22.dist-info/RECORD +0 -36
- test/__init__.py +0 -0
- test/conftest.py +0 -18
- test/constants.py +0 -10
- test/test_api_changes.py +0 -137
- test/test_api_surface.py +0 -304
- test/test_databricks_secret_auth.py +0 -116
- test/test_marshaling.py +0 -910
- {dasl_client-1.0.22.dist-info → dasl_client-1.0.24.dist-info/licenses}/LICENSE +0 -0
|
@@ -7,6 +7,12 @@ from dasl_client.preset_development.stage import *
|
|
|
7
7
|
from dasl_client.preset_development.errors import *
|
|
8
8
|
import yaml
|
|
9
9
|
from IPython import get_ipython
|
|
10
|
+
from itertools import count
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@udf(StringType())
|
|
14
|
+
def constant_udf(*args):
|
|
15
|
+
return "<sortable_random_id>"
|
|
10
16
|
|
|
11
17
|
|
|
12
18
|
class PreviewEngine:
|
|
@@ -54,8 +60,9 @@ class PreviewEngine:
|
|
|
54
60
|
self._preset.get("silver", None), self._pretransform_name
|
|
55
61
|
)
|
|
56
62
|
|
|
63
|
+
self._pre_bronze = None
|
|
57
64
|
self._bronze = None
|
|
58
|
-
self.
|
|
65
|
+
self._pre_silver = None
|
|
59
66
|
self._silver = []
|
|
60
67
|
self._gold = []
|
|
61
68
|
self._result_df_map: Tuple[
|
|
@@ -124,13 +131,32 @@ class PreviewEngine:
|
|
|
124
131
|
|
|
125
132
|
def _compile_stages(self) -> None:
|
|
126
133
|
"""
|
|
127
|
-
Creates Stage objects, setting pretransform to None if not provided.
|
|
134
|
+
Creates Stage objects, setting silver pretransform to None if not provided.
|
|
128
135
|
"""
|
|
136
|
+
pre_bronze_field_counter = count()
|
|
137
|
+
pre_bronze_name_counter = count()
|
|
138
|
+
pre_bronze_expr_groups = self._preset.get("bronze", {}).get("preTransform", [])
|
|
139
|
+
if pre_bronze_expr_groups:
|
|
140
|
+
tables = [
|
|
141
|
+
{
|
|
142
|
+
"name": f"Index {next(pre_bronze_name_counter)}",
|
|
143
|
+
"fields": [
|
|
144
|
+
{"name": str(next(pre_bronze_field_counter)), "expr": expr}
|
|
145
|
+
for expr in expr_group
|
|
146
|
+
],
|
|
147
|
+
}
|
|
148
|
+
for expr_group in pre_bronze_expr_groups
|
|
149
|
+
]
|
|
150
|
+
for table in tables:
|
|
151
|
+
self._pre_bronze = [
|
|
152
|
+
Stage(self._spark, "bronze pretransform", table) for table in tables
|
|
153
|
+
]
|
|
154
|
+
|
|
129
155
|
pretransform = None
|
|
130
156
|
if self._pretransform_name:
|
|
131
157
|
for table in self._preset["silver"]["preTransform"]:
|
|
132
158
|
if table["name"] == self._pretransform_name:
|
|
133
|
-
self.
|
|
159
|
+
self._pre_silver = Stage(self._spark, "silver pretransform", table)
|
|
134
160
|
break
|
|
135
161
|
|
|
136
162
|
self._silver = [
|
|
@@ -151,8 +177,56 @@ class PreviewEngine:
|
|
|
151
177
|
Returns:
|
|
152
178
|
Dataframes containing the output from each run Stage.
|
|
153
179
|
"""
|
|
154
|
-
|
|
155
|
-
|
|
180
|
+
# If we are in silverbronze mode, and an autoloader has been provided, or we are
|
|
181
|
+
# not in silverbronze mode, we need to run the preBronze stage.
|
|
182
|
+
pre_bronze_output = {}
|
|
183
|
+
if (
|
|
184
|
+
self._ds_params._mode != "silverbronze"
|
|
185
|
+
or self._ds_params._autoloader_location
|
|
186
|
+
):
|
|
187
|
+
if self._pre_bronze:
|
|
188
|
+
for stage in self._pre_bronze:
|
|
189
|
+
df = stage.run(df)
|
|
190
|
+
pre_bronze_output[stage._name] = df
|
|
191
|
+
else:
|
|
192
|
+
# We are in silverbronze mode with no autoloader, so we treat first
|
|
193
|
+
# silverbronze table as initial df.
|
|
194
|
+
df = (
|
|
195
|
+
self._spark.table(self._ds_params._bronze_tables[0].get("name", ""))
|
|
196
|
+
.drop("dasl_id")
|
|
197
|
+
.limit(self._ds_params._record_limit)
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
if time_col := self._ds_params._time_column:
|
|
201
|
+
df = df.filter(
|
|
202
|
+
f"timestamp({time_col}) >= timestamp('{self._ds_params._start_time}') AND timestamp({time_col}) < timestamp('{self._ds_params._end_time}')"
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
df = df.withColumn("dasl_id", constant_udf())
|
|
206
|
+
|
|
207
|
+
self._bronze = df
|
|
208
|
+
|
|
209
|
+
# Deal with silverbronze table joins.
|
|
210
|
+
# Note: We can blind get here as validation should've caught anything missing.
|
|
211
|
+
if self._ds_params._mode == "silverbronze":
|
|
212
|
+
if alias := self._ds_params._bronze_tables[0].get("alias", None):
|
|
213
|
+
df = df.alias(alias)
|
|
214
|
+
for bronze_table in self._ds_params._bronze_tables[1:]:
|
|
215
|
+
join_df = (
|
|
216
|
+
spark.table(bronze_table["name"])
|
|
217
|
+
.drop("dasl_id")
|
|
218
|
+
.limit(self._ds_params._record_limit)
|
|
219
|
+
)
|
|
220
|
+
if alias := bronze_table.get("alias", None):
|
|
221
|
+
join_df = join_df.alias(alias)
|
|
222
|
+
df = df.join(
|
|
223
|
+
join_df,
|
|
224
|
+
expr(bronze_table["joinExpr"]),
|
|
225
|
+
bronze_table.get("joinType", "left"),
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
if self._pre_silver:
|
|
229
|
+
df = self._pre_silver.run(df)
|
|
156
230
|
|
|
157
231
|
silver_output_map = {}
|
|
158
232
|
for table in self._silver:
|
|
@@ -166,15 +240,17 @@ class PreviewEngine:
|
|
|
166
240
|
)
|
|
167
241
|
|
|
168
242
|
return (
|
|
169
|
-
(df, silver_output_map, gold_output_map)
|
|
170
|
-
if self.
|
|
171
|
-
else (None, silver_output_map, gold_output_map)
|
|
243
|
+
(df, silver_output_map, gold_output_map, pre_bronze_output)
|
|
244
|
+
if self._pre_silver
|
|
245
|
+
else (None, silver_output_map, gold_output_map, pre_bronze_output)
|
|
172
246
|
)
|
|
173
247
|
|
|
174
248
|
def _render_output(
|
|
175
249
|
self,
|
|
176
250
|
input_df: DataFrame,
|
|
177
|
-
stage_dataframes: Tuple[
|
|
251
|
+
stage_dataframes: Tuple[
|
|
252
|
+
List[DataFrame], DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]
|
|
253
|
+
],
|
|
178
254
|
gold_table_catalog: str,
|
|
179
255
|
gold_table_schema: str,
|
|
180
256
|
) -> None:
|
|
@@ -195,7 +271,7 @@ class PreviewEngine:
|
|
|
195
271
|
def d(txt, lvl) -> None:
|
|
196
272
|
displayHTML(
|
|
197
273
|
f"""
|
|
198
|
-
<div style="background-color:
|
|
274
|
+
<div style="background-color:
|
|
199
275
|
background-color: rgb(18, 23, 26); padding: 0; margin: 0;">
|
|
200
276
|
<h{lvl} style="margin: 0; background-color: rgb(244, 234, 229);">{txt}</h{lvl}>
|
|
201
277
|
</div>
|
|
@@ -227,12 +303,16 @@ class PreviewEngine:
|
|
|
227
303
|
prefix=prefix + target_field.name + ".",
|
|
228
304
|
)
|
|
229
305
|
|
|
230
|
-
(
|
|
231
|
-
d("Input", 1)
|
|
306
|
+
(pre_silver, silver, gold, pre_bronze) = stage_dataframes
|
|
307
|
+
d("Autoloader Input", 1)
|
|
232
308
|
display(input_df)
|
|
309
|
+
d("Bronze Pre-Transform", 1)
|
|
310
|
+
for name, df in pre_bronze.items():
|
|
311
|
+
d(f"{name}", 2)
|
|
312
|
+
display(df)
|
|
233
313
|
d("Silver Pre-Transform", 1)
|
|
234
|
-
if
|
|
235
|
-
display(
|
|
314
|
+
if pre_silver:
|
|
315
|
+
display(pre_silver)
|
|
236
316
|
else:
|
|
237
317
|
d("Skipped", 2)
|
|
238
318
|
d("Silver Transform", 1)
|
|
@@ -240,60 +320,40 @@ class PreviewEngine:
|
|
|
240
320
|
d(f"{name}", 2)
|
|
241
321
|
display(df)
|
|
242
322
|
d("Gold", 1)
|
|
243
|
-
for
|
|
244
|
-
d(f"{
|
|
323
|
+
for full_name, df in gold.items():
|
|
324
|
+
d(f"{full_name}", 2)
|
|
245
325
|
d("Stage output", 3)
|
|
246
326
|
display(df)
|
|
247
327
|
|
|
248
328
|
# NOTE: Name is stored as Gold_name/Silver_input. So we need to get just the Gold table
|
|
249
329
|
# name that we are comparing the dataframe metadata to.
|
|
250
|
-
name =
|
|
330
|
+
name = full_name.split("/")[0]
|
|
251
331
|
fqn_gold_table_name = f"{self.force_apply_backticks(gold_table_catalog)}.{self.force_apply_backticks(gold_table_schema)}.{self.force_apply_backticks(name)}"
|
|
252
332
|
|
|
253
333
|
if not self._spark.catalog.tableExists(f"{fqn_gold_table_name}"):
|
|
254
334
|
raise UnknownGoldTableError(name, gold_table_schema)
|
|
255
335
|
|
|
256
|
-
#
|
|
336
|
+
# Create a temporary table to perform the type check
|
|
257
337
|
delta_df = self._spark.table(f"{fqn_gold_table_name}").limit(0)
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
if not set(df.columns).issubset(delta_df.columns):
|
|
262
|
-
raise GoldTableCompatibilityError(
|
|
263
|
-
f"Extra columns provided: {', '.join([col for col in df.columns if col not in delta_df.columns])}"
|
|
264
|
-
)
|
|
338
|
+
delta_df.write.mode("overwrite").save(
|
|
339
|
+
f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
|
|
340
|
+
)
|
|
265
341
|
|
|
266
|
-
#
|
|
267
|
-
|
|
268
|
-
if isinstance(field.dataType, StructType) and field.name in df.columns:
|
|
269
|
-
# Retrieve the corresponding field from the DataFrame's schema.
|
|
270
|
-
df_field = next(f for f in df.schema.fields if f.name == field.name)
|
|
271
|
-
check_struct_compatibility(field, df_field)
|
|
342
|
+
# Update the params to indicate we've added a testing temp gold table
|
|
343
|
+
self._ds_params.add_gold_schema_table(full_name)
|
|
272
344
|
|
|
273
|
-
#
|
|
274
|
-
non_nullable_cols = [
|
|
275
|
-
field.name for field in delta_df.schema.fields if not field.nullable
|
|
276
|
-
]
|
|
277
|
-
null_checks = [
|
|
278
|
-
sum_(when(col_(col).isNull(), 1).otherwise(0)).alias(col)
|
|
279
|
-
for col in non_nullable_cols
|
|
280
|
-
]
|
|
281
|
-
null_counts = df.select(null_checks).collect()[0].asDict()
|
|
282
|
-
cols_with_nulls = []
|
|
345
|
+
# Perform the type checks by trying to insert data into the table
|
|
283
346
|
try:
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
except
|
|
288
|
-
# There were no records returned and so null_counts == None.
|
|
289
|
-
pass
|
|
290
|
-
if cols_with_nulls:
|
|
347
|
+
df.write.mode("append").save(
|
|
348
|
+
f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
|
|
349
|
+
)
|
|
350
|
+
except Exception as e:
|
|
291
351
|
raise GoldTableCompatibilityError(
|
|
292
|
-
f"
|
|
352
|
+
f"Preset gold table '{full_name}' did not match the gold schema for {fqn_gold_table_name}: {repr(e)}"
|
|
293
353
|
)
|
|
294
354
|
|
|
295
355
|
d("Resultant gold table preview", 3)
|
|
296
|
-
display(
|
|
356
|
+
display(df)
|
|
297
357
|
|
|
298
358
|
def is_backtick_escaped(self, name: str) -> bool:
|
|
299
359
|
"""
|
|
@@ -346,31 +406,32 @@ class PreviewEngine:
|
|
|
346
406
|
)
|
|
347
407
|
|
|
348
408
|
# If we are using the autoloader, fetch format from preset and others.
|
|
349
|
-
if self._ds_params._mode == "autoloader"
|
|
409
|
+
if self._ds_params._mode == "autoloader" or (
|
|
410
|
+
self._ds_params._mode == "silverbronze"
|
|
411
|
+
and self._ds_params._autoloader_location
|
|
412
|
+
):
|
|
413
|
+
if self._preset.get("bronze", {}).get("loadAsSingleVariant", False) == True:
|
|
414
|
+
self._ds_params._set_load_as_single_variant()
|
|
350
415
|
if not (autoloader_conf := self._preset.get("autoloader", None)):
|
|
351
416
|
raise MissingAutoloaderConfigError()
|
|
352
417
|
if not (file_format := autoloader_conf.get("format", None)):
|
|
353
418
|
raise AutoloaderMissingFieldError("format")
|
|
354
|
-
self._ds_params.
|
|
419
|
+
self._ds_params._set_autoloader_format(file_format)
|
|
355
420
|
if schemaFile := autoloader_conf.get("schemaFile", None):
|
|
356
|
-
self._ds_params.
|
|
357
|
-
if multiline := autoloader_conf.get("multiline", None):
|
|
358
|
-
if multiline == "true":
|
|
359
|
-
self._ds_params.set_multiline(True)
|
|
360
|
-
else:
|
|
361
|
-
self._ds_params.set_multiline(False)
|
|
421
|
+
self._ds_params._set_autoloader_schema_file(schemaFile)
|
|
362
422
|
if cloudFiles := autoloader_conf.get("cloudFiles", None):
|
|
363
423
|
if schema_hints := cloudFiles.get("schemaHints", None):
|
|
364
|
-
self._ds_params.
|
|
424
|
+
self._ds_params._set_autoloader_cloudfiles_schema_hints(
|
|
425
|
+
schema_hints
|
|
426
|
+
)
|
|
365
427
|
if schema_hints_file := cloudFiles.get("schemaHintsFile", None):
|
|
366
|
-
self._ds_params.
|
|
428
|
+
self._ds_params._set_autoloader_cloudfiles_schema_hint_file(
|
|
367
429
|
schema_hints_file
|
|
368
430
|
)
|
|
369
431
|
|
|
370
432
|
self._compile_stages()
|
|
371
433
|
|
|
372
434
|
with self._ds_params as df:
|
|
373
|
-
self._bronze = df
|
|
374
435
|
self._result_df_map = self._run(df)
|
|
375
436
|
if display:
|
|
376
437
|
self._render_output(
|