dasl-client 1.0.22__py3-none-any.whl → 1.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dasl-client might be problematic.

@@ -7,6 +7,12 @@ from dasl_client.preset_development.stage import *
 from dasl_client.preset_development.errors import *
 import yaml
 from IPython import get_ipython
+from itertools import count
+
+
+@udf(StringType())
+def constant_udf(*args):
+    return "<sortable_random_id>"
 
 
 class PreviewEngine:
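
Note: the new module-level constant_udf is what the preview engine later uses to stamp a placeholder dasl_id onto every preview row (see the _run hunk further down). A minimal sketch of that usage, assuming a live SparkSession named spark and that udf/StringType come from the module's existing pyspark star imports:

    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType

    @udf(StringType())
    def constant_udf(*args):
        # Every preview row receives the same placeholder id.
        return "<sortable_random_id>"

    # Hypothetical usage mirroring PreviewEngine._run:
    df = spark.createDataFrame([(1,), (2,)], ["value"])
    df.withColumn("dasl_id", constant_udf()).show()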
@@ -54,8 +60,9 @@ class PreviewEngine:
             self._preset.get("silver", None), self._pretransform_name
         )
 
+        self._pre_bronze = None
         self._bronze = None
-        self._pre = None
+        self._pre_silver = None
         self._silver = []
         self._gold = []
         self._result_df_map: Tuple[
@@ -124,13 +131,32 @@ class PreviewEngine:
 
     def _compile_stages(self) -> None:
         """
-        Creates Stage objects, setting pretransform to None if not provided.
+        Creates Stage objects, setting silver pretransform to None if not provided.
         """
+        pre_bronze_field_counter = count()
+        pre_bronze_name_counter = count()
+        pre_bronze_expr_groups = self._preset.get("bronze", {}).get("preTransform", [])
+        if pre_bronze_expr_groups:
+            tables = [
+                {
+                    "name": f"Index {next(pre_bronze_name_counter)}",
+                    "fields": [
+                        {"name": str(next(pre_bronze_field_counter)), "expr": expr}
+                        for expr in expr_group
+                    ],
+                }
+                for expr_group in pre_bronze_expr_groups
+            ]
+            for table in tables:
+                self._pre_bronze = [
+                    Stage(self._spark, "bronze pretransform", table) for table in tables
+                ]
+
         pretransform = None
         if self._pretransform_name:
             for table in self._preset["silver"]["preTransform"]:
                 if table["name"] == self._pretransform_name:
-                    self._pre = Stage(self._spark, "silver pretransform", table)
+                    self._pre_silver = Stage(self._spark, "silver pretransform", table)
                     break
 
         self._silver = [
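
Note: bronze preTransform support is new in _compile_stages. The preset's bronze.preTransform is read as a list of expression groups; each group becomes a synthetic table definition named "Index N" whose fields are numbered by a counter shared across groups, and each definition is wrapped in a Stage. A sketch of the intermediate tables list for a hypothetical preset (the expression strings are made up for illustration):

    from itertools import count

    pre_bronze_expr_groups = [
        ["upper(host) as host", "to_timestamp(ts) as event_time"],
        ["lower(user) as user"],
    ]

    field_counter, name_counter = count(), count()
    tables = [
        {
            "name": f"Index {next(name_counter)}",
            "fields": [
                {"name": str(next(field_counter)), "expr": expr}
                for expr in expr_group
            ],
        }
        for expr_group in pre_bronze_expr_groups
    ]
    # tables[0] is "Index 0" with fields "0" and "1";
    # tables[1] is "Index 1" with field "2".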
@@ -151,8 +177,56 @@ class PreviewEngine:
         Returns:
             Dataframes containing the output from each run Stage.
         """
-        if self._pre:
-            df = self._pre.run(df)
+        # If we are in silverbronze mode, and an autoloader has been provided, or we are
+        # not in silverbronze mode, we need to run the preBronze stage.
+        pre_bronze_output = {}
+        if (
+            self._ds_params._mode != "silverbronze"
+            or self._ds_params._autoloader_location
+        ):
+            if self._pre_bronze:
+                for stage in self._pre_bronze:
+                    df = stage.run(df)
+                    pre_bronze_output[stage._name] = df
+        else:
+            # We are in silverbronze mode with no autoloader, so we treat first
+            # silverbronze table as initial df.
+            df = (
+                self._spark.table(self._ds_params._bronze_tables[0].get("name", ""))
+                .drop("dasl_id")
+                .limit(self._ds_params._record_limit)
+            )
+
+            if time_col := self._ds_params._time_column:
+                df = df.filter(
+                    f"timestamp({time_col}) >= timestamp('{self._ds_params._start_time}') AND timestamp({time_col}) < timestamp('{self._ds_params._end_time}')"
+                )
+
+        df = df.withColumn("dasl_id", constant_udf())
+
+        self._bronze = df
+
+        # Deal with silverbronze table joins.
+        # Note: We can blind get here as validation should've caught anything missing.
+        if self._ds_params._mode == "silverbronze":
+            if alias := self._ds_params._bronze_tables[0].get("alias", None):
+                df = df.alias(alias)
+            for bronze_table in self._ds_params._bronze_tables[1:]:
+                join_df = (
+                    spark.table(bronze_table["name"])
+                    .drop("dasl_id")
+                    .limit(self._ds_params._record_limit)
+                )
+                if alias := bronze_table.get("alias", None):
+                    join_df = join_df.alias(alias)
+                df = df.join(
+                    join_df,
+                    expr(bronze_table["joinExpr"]),
+                    bronze_table.get("joinType", "left"),
+                )
+
+        if self._pre_silver:
+            df = self._pre_silver.run(df)
 
         silver_output_map = {}
         for table in self._silver:
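
Note: the other addition to _run is silverbronze join handling. The first entry in _bronze_tables becomes the starting dataframe (aliased if an alias is configured); every later entry is read, stripped of dasl_id, limited, and joined on via its joinExpr, with joinType defaulting to "left". An illustrative shape for that config, with hypothetical table names:

    # Keys mirror what _run reads; the values are placeholders.
    bronze_tables = [
        {"name": "catalog.schema.events", "alias": "e"},
        {
            "name": "catalog.schema.users",
            "alias": "u",
            "joinExpr": "e.user_id = u.id",
            "joinType": "left",
        },
    ]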
@@ -166,15 +240,17 @@ class PreviewEngine:
             )
 
         return (
-            (df, silver_output_map, gold_output_map)
-            if self._pre
-            else (None, silver_output_map, gold_output_map)
+            (df, silver_output_map, gold_output_map, pre_bronze_output)
+            if self._pre_silver
+            else (None, silver_output_map, gold_output_map, pre_bronze_output)
         )
 
     def _render_output(
         self,
         input_df: DataFrame,
-        stage_dataframes: Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]],
+        stage_dataframes: Tuple[
+            List[DataFrame], DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]
+        ],
         gold_table_catalog: str,
         gold_table_schema: str,
     ) -> None:
@@ -195,7 +271,7 @@ class PreviewEngine:
         def d(txt, lvl) -> None:
             displayHTML(
                 f"""
-                <div style="background-color:
+                <div style="background-color:
                 background-color: rgb(18, 23, 26); padding: 0; margin: 0;">
                     <h{lvl} style="margin: 0; background-color: rgb(244, 234, 229);">{txt}</h{lvl}>
                 </div>
@@ -227,12 +303,16 @@ class PreviewEngine:
                         prefix=prefix + target_field.name + ".",
                     )
 
-        (pre_df, silver, gold) = stage_dataframes
-        d("Input", 1)
+        (pre_silver, silver, gold, pre_bronze) = stage_dataframes
+        d("Autoloader Input", 1)
         display(input_df)
+        d("Bronze Pre-Transform", 1)
+        for name, df in pre_bronze.items():
+            d(f"{name}", 2)
+            display(df)
         d("Silver Pre-Transform", 1)
-        if pre_df:
-            display(pre_df)
+        if pre_silver:
+            display(pre_silver)
         else:
             d("Skipped", 2)
         d("Silver Transform", 1)
@@ -240,60 +320,40 @@ class PreviewEngine:
             d(f"{name}", 2)
             display(df)
         d("Gold", 1)
-        for name, df in gold.items():
-            d(f"{name}", 2)
+        for full_name, df in gold.items():
+            d(f"{full_name}", 2)
             d("Stage output", 3)
             display(df)
 
             # NOTE: Name is stored as Gold_name/Silver_input. So we need to get just the Gold table
             # name that we are comparing the dataframe metadata to.
-            name = name.split("/")[0]
+            name = full_name.split("/")[0]
             fqn_gold_table_name = f"{self.force_apply_backticks(gold_table_catalog)}.{self.force_apply_backticks(gold_table_schema)}.{self.force_apply_backticks(name)}"
 
             if not self._spark.catalog.tableExists(f"{fqn_gold_table_name}"):
                 raise UnknownGoldTableError(name, gold_table_schema)
 
-            # Performs the type check.
+            # Create a temporary table to perform the type check
             delta_df = self._spark.table(f"{fqn_gold_table_name}").limit(0)
-            unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
-
-            # Now we check no new columns.
-            if not set(df.columns).issubset(delta_df.columns):
-                raise GoldTableCompatibilityError(
-                    f"Extra columns provided: {', '.join([col for col in df.columns if col not in delta_df.columns])}"
-                )
+            delta_df.write.mode("overwrite").save(
+                f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
+            )
 
-            # Now we check no new fields in STRUCT columns.
-            for field in delta_df.schema.fields:
-                if isinstance(field.dataType, StructType) and field.name in df.columns:
-                    # Retrieve the corresponding field from the DataFrame's schema.
-                    df_field = next(f for f in df.schema.fields if f.name == field.name)
-                    check_struct_compatibility(field, df_field)
+            # Update the params to indicate we've added a testing temp gold table
+            self._ds_params.add_gold_schema_table(full_name)
 
-            # Check nullable columns exist, and data what we are inserting is set.
-            non_nullable_cols = [
-                field.name for field in delta_df.schema.fields if not field.nullable
-            ]
-            null_checks = [
-                sum_(when(col_(col).isNull(), 1).otherwise(0)).alias(col)
-                for col in non_nullable_cols
-            ]
-            null_counts = df.select(null_checks).collect()[0].asDict()
-            cols_with_nulls = []
+            # Perform the type checks by trying to insert data into the table
             try:
-                cols_with_nulls = [
-                    col_name for col_name, count in null_counts.items() if count > 0
-                ]
-            except TypeError:
-                # There were no records returned and so null_counts == None.
-                pass
-            if cols_with_nulls:
+                df.write.mode("append").save(
+                    f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
+                )
+            except Exception as e:
                 raise GoldTableCompatibilityError(
-                    f"Record with null data found for non-nullable columns: {', '.join([col for col in cols_with_nulls])}"
+                    f"Preset gold table '{full_name}' did not match the gold schema for {fqn_gold_table_name}: {repr(e)}"
                 )
 
             d("Resultant gold table preview", 3)
-            display(unioned_df)
+            display(df)
 
     def is_backtick_escaped(self, name: str) -> bool:
         """
@@ -346,31 +406,32 @@ class PreviewEngine:
         )
 
         # If we are using the autoloader, fetch format from preset and others.
-        if self._ds_params._mode == "autoloader":
+        if self._ds_params._mode == "autoloader" or (
+            self._ds_params._mode == "silverbronze"
+            and self._ds_params._autoloader_location
+        ):
+            if self._preset.get("bronze", {}).get("loadAsSingleVariant", False) == True:
+                self._ds_params._set_load_as_single_variant()
             if not (autoloader_conf := self._preset.get("autoloader", None)):
                 raise MissingAutoloaderConfigError()
             if not (file_format := autoloader_conf.get("format", None)):
                 raise AutoloaderMissingFieldError("format")
-            self._ds_params.set_autoloader_format(file_format)
+            self._ds_params._set_autoloader_format(file_format)
             if schemaFile := autoloader_conf.get("schemaFile", None):
-                self._ds_params.set_autoloader_schema_file(schemaFile)
-            if multiline := autoloader_conf.get("multiline", None):
-                if multiline == "true":
-                    self._ds_params.set_multiline(True)
-                else:
-                    self._ds_params.set_multiline(False)
+                self._ds_params._set_autoloader_schema_file(schemaFile)
             if cloudFiles := autoloader_conf.get("cloudFiles", None):
                 if schema_hints := cloudFiles.get("schemaHints", None):
-                    self._ds_params.set_autoloader_cloudfiles_schema_hints(schema_hints)
+                    self._ds_params._set_autoloader_cloudfiles_schema_hints(
+                        schema_hints
+                    )
                 if schema_hints_file := cloudFiles.get("schemaHintsFile", None):
-                    self._ds_params.set_autoloader_cloudfiles_schema_hint_file(
+                    self._ds_params._set_autoloader_cloudfiles_schema_hint_file(
                         schema_hints_file
                     )
 
         self._compile_stages()
 
         with self._ds_params as df:
-            self._bronze = df
             self._result_df_map = self._run(df)
             if display:
                 self._render_output(
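
Note: this final hunk extends autoloader configuration to silverbronze presets that also supply an autoloader location, adds bronze.loadAsSingleVariant handling, switches the parameter setters to their underscore-prefixed variants, drops the old multiline handling, and moves the self._bronze assignment into _run. The preset fields it now consumes, sketched as a Python dict with illustrative values:

    preset = {
        "bronze": {"loadAsSingleVariant": True},
        "autoloader": {
            "format": "json",
            "schemaFile": "schemas/events.json",
            "cloudFiles": {
                "schemaHints": "ts TIMESTAMP, host STRING",
                "schemaHintsFile": "schemas/hints.sql",
            },
        },
    }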