dasl-client 1.0.22__py3-none-any.whl → 1.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dasl-client might be problematic.

@@ -9,11 +9,6 @@ import uuid
  from IPython import get_ipython


- @udf(StringType())
- def constant_udf(*args):
- return "<sortable_random_id>"
-
-
  class PreviewParameters:
  """
  This class provides three methods for supplying input records to the preset development environment.
@@ -60,6 +55,78 @@ class PreviewParameters:
  .set_table("system.access.audit")
  ```

+ **4. SilverBronze Mode:**
+ "silverbronze" mode works like a more advanced "table" mode: it allows multiple tables to be joined
+ as input and requires bronze table definitions to be set. The mode behaves in two separate ways
+ depending on whether an autoloader location is set. If an autoloader location is set, the first
+ entry in the bronze table definitions is used to name and alias the autoloader's input, and that
+ alias can be used in later join expressions. Used in this way, the autoloader is loaded as in
+ "autoloader" mode and run through preBronze stages before being joined with the remainder of the
+ bronze table definitions; this mimics a DataSource that does not skip bronze and joins what was
+ read in silver. If an autoloader location is not set, the behaviour instead emulates a DataSource
+ set to skip the bronze stage: all preBronze and bronze stages are skipped, and the first entry in
+ the given bronze table definitions is read from instead. Any subsequent bronze table definitions
+ are joined against this table.
+
+ Using no autoloader location (this will read from the first table):
+ ```python
+ bronze_tables = [
+ {
+ "name": "databricks_dev.default.sev_map",
+ "alias": "tab1"
+ },
+ {
+ "name": "databricks_dev.alan_bronze.akamai_waf",
+ "alias": "tab2",
+ "joinExpr": "id::string = tab2.serviceID",
+ "joinType": "left"
+ },
+ {
+ "name": "databricks_dev.alan_silver.cloudflare_hjttp_request",
+ "alias": "tab3",
+ "joinExpr": "tab1.id::string = tab3.ClientRequestsBytes",
+ "joinType": "inner"
+ }
+ ]
+
+ ds_params = (
+ PreviewParameters(spark)
+ .from_silverbronze_tables()
+ .set_bronze_table_definitions(bronze_tables)
+ )
+
+ ps = PreviewEngine(spark, yaml_string, ds_params)
+ ps.evaluate("stage.gold")
+ ```
+
+ Using an autoloader location (this will read from the autoloader and name the df tab1):
+ ```python
+ bronze_tables = [
+ {
+ "name": "tab1"
+ },
+ {
+ "name": "databricks_dev.alan_bronze.akamai_waf",
+ "alias": "tab2",
+ "joinExpr": "id::string = tab2.serviceID",
+ "joinType": "left"
+ },
+ {
+ "name": "databricks_dev.alan_silver.cloudflare_hjttp_request",
+ "alias": "tab3",
+ "joinExpr": "tab1.id::string = tab3.ClientRequestsBytes",
+ "joinType": "inner"
+ }
+ ]
+
+ ds_params = (
+ PreviewParameters(spark)
+ .from_silverbronze_tables()
+ .set_bronze_table_definitions(bronze_tables)
+ .set_autoloader_location("s3://antimatter-dasl-testing/csamples3/mars/area1/")
+ )
+ ```
+
  **Note:**
  When using autoloader mode, this implementation requires a location to store a temporary schema for
  the loaded records. By default, this is set to `"dbfs:/tmp/schemas"`. You can change this using
@@ -94,9 +161,10 @@ class PreviewParameters:
  df (DataFrame): Internal Spark DataFrame loaded using the specified parameters.
  """
  self._spark = spark
- self._mode = None # [input, autoloader]
+ self._mode = None # [input, table, autoloader, silverbronze]
  self._record_limit = 10
  self._autoloader_temp_schema_location = "dbfs:/tmp/schemas"
+ self._gold_test_schemas = []

  self._time_column = None
  self._start_time = None
@@ -109,18 +177,97 @@ class PreviewParameters:
  self._cloudfiles_schema_hints = None
  self._cloudfiles_reader_case_sensitive = "true"
  self._cloudfiles_multiline = "true"
+ self._cloudfiles_wholetext = "false"
  self._schema_uuid_str = str(uuid.uuid4())
+ self._single_variant_column = None

  self._schema = None
  self._data = None

  self._table = None

+ self._bronze_tables = None
+
  self._pretransform_name = None
- self._bronze_pre_transform: Optional[List[str]] = None

  self._df = None

+ def __create_from_autoloader(self) -> DataFrame:
+ stream_df = (
+ self._spark.readStream.format("cloudFiles")
+ .option("cloudFiles.format", self._autoloader_format)
+ .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
+ )
+
+ # text and wholetext need to be handled separately.
+ stream_df = (
+ stream_df.option("multiline", self._cloudfiles_multiline)
+ if self._autoloader_format != "text"
+ else stream_df.option("wholetext", self._cloudfiles_wholetext)
+ )
+
+ if self._single_variant_column:
+ stream_df = stream_df.option(
+ "singleVariantColumn", self._single_variant_column
+ )
+
+ if self._schema_file:
+ with open(self._schema_file, "r") as f:
+ stream_df = stream_df.schema(f.read().strip())
+ else:
+ stream_df = (
+ stream_df.option("inferSchema", "true")
+ .option("cloudFiles.inferColumnTypes", "true")
+ .option(
+ "cloudFiles.schemaLocation",
+ f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+ )
+ )
+
+ if self._cloudfiles_schema_hints:
+ stream_df = stream_df.option(
+ "cloudFiles.schemaHints", self._cloudfiles_schema_hints
+ )
+ elif self._clouldfiles_schema_hints_file:
+ stream_df = stream_df.option(
+ "cloudFiles.schemaHintsFile", self._clouldfiles_schema_hints_file
+ )
+
+ stream_df = stream_df.load(self._autoloader_location).limit(self._record_limit)
+
+ query = (
+ stream_df.writeStream.format("memory")
+ .queryName("batch_data")
+ .trigger(availableNow=True)
+ .start()
+ )
+
+ query.awaitTermination()
+
+ def __create_from_silverbronze_tables_join(self) -> DataFrame:
+ if not self._bronze_tables or not len(self._bronze_tables):
+ raise MissingBronzeTablesError()
+
+ # Validate name and joinExpr are set.
+ for i in range(len(self._bronze_tables)):
+ if not self._bronze_tables[i].get("name", None):
+ raise MissingBronzeTableFieldError("name")
+ if i > 0 and not self._bronze_tables[i].get("joinExpr", None):
+ raise MissingBronzeTableFieldError("joinExpr")
+
+ # If there is an autoloader location given, we create the df now and
+ # then allow preBronze stage to run. Otherwise we skip preBronze stages
+ # and as part of running the silverbronze joins we create the df from
+ # the first entry in the bronze tables list.
+ df = None
+ if self._autoloader_location:
+ self.__create_from_autoloader()
+ df = self._spark.table("batch_data").alias(
+ self._bronze_tables[0].get("name", "")
+ ) # Use first's name.
+
+ return df
+
  def __enter__(self):
  """
  Creates a DataFrame with data using the method specified. In the case of "autoloader",
@@ -136,59 +283,10 @@ class PreviewParameters:
  elif self._mode == "table":
  self._df = self._spark.table(self._table).limit(self._record_limit)
  elif self._mode == "autoloader":
- stream_df = (
- self._spark.readStream.format("cloudFiles")
- .option("cloudFiles.format", self._autoloader_format)
- .option("multiline", self._cloudfiles_multiline)
- .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
- )
-
- if self._schema_file:
- with open(self._schema_file, "r") as f:
- stream_df = stream_df.schema(f.read().strip())
- else:
- stream_df = (
- stream_df.option("inferSchema", "true")
- .option("cloudFiles.inferColumnTypes", "true")
- .option(
- "cloudFiles.schemaLocation",
- f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
- )
- )
-
- if self._cloudfiles_schema_hints:
- stream_df = stream_df.option(
- "cloudFiles.schemaHints", self._cloudfiles_schema_hints
- )
- elif self._clouldfiles_schema_hints_file:
- stream_df = stream_df.option(
- "cloudFiles.schemaHintsFile", self._clouldfiles_schema_hints_file
- )
-
- stream_df = stream_df.load(self._autoloader_location).limit(
- self._record_limit
- )
-
- if self._bronze_pre_transform is not None:
- stream_df = stream_df.selectExpr(*self._bronze_pre_transform)
-
- query = (
- stream_df.writeStream.format("memory")
- .queryName("batch_data")
- .trigger(availableNow=True)
- .start()
- )
-
- query.awaitTermination()
-
+ self.__create_from_autoloader()
  self._df = self._spark.table("batch_data")
-
- if self._time_column:
- self._df = self._df.filter(
- f"timestamp({self._time_column}) >= timestamp('{self._start_time}') AND timestamp({self._time_column}) < timestamp('{self._end_time}')"
- )
-
- self._df = self._df.withColumn("dasl_id", constant_udf())
+ elif self._mode == "silverbronze":
+ self._df = self.__create_from_silverbronze_tables_join()

  return self._df

@@ -206,10 +304,21 @@ class PreviewParameters:
  f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
  recurse=True,
  )
+ for gold_test_schema in self._gold_test_schemas:
+ dbutils.fs.rm(
+ f"{self._autoloader_temp_schema_location}/{gold_test_schema}",
+ recurse=True,
+ )
  else:
- print(
- f"FYI, we are leaking temp data {self._autoloader_temp_schema_location}/{self._schema_uuid_str}"
- )
+ leaked_lines = [
+ f"FYI, we are leaking temp data {self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+ *[
+ f"{self._autoloader_temp_schema_location}/{x}"
+ for x in self._gold_test_schemas
+ ],
+ ]
+ print(", ".join(leaked_lines))
+ self._gold_test_schemas = []

  def from_input(self):
  """
@@ -242,6 +351,36 @@ class PreviewParameters:
  self._mode = "table"
  return self

+ def from_silverbronze_tables(self):
+ """
+ Set the data source loader to "silverbronze" mode. Requires a list of bronze table
+ definitions to be provided.
+
+ Returns:
+ PreviewParameters: The current instance with updated configuration.
+ """
+ self._mode = "silverbronze"
+ return self
+
+ def set_bronze_table_definitions(self, definitions: List[Dict[str, str]]):
+ """
+ Set the bronze table definitions for bronze tables mode. `name` is required for every
+ entry, and `joinExpr` is required for every entry after the first. If `alias` is not
+ provided, the `name` can be used to refer to the table. If `joinType` is not provided,
+ "left" is used as the default value.
+
+ [
+ {
+ "name": "name",
+ "alias": "alias1",
+ "joinType": "inner",
+ "joinExpr": "base_table.col1 = alias1.col1"
+ },
+ ...
+ ]
+ """
+ self._bronze_tables = definitions
+ return self
+
  def set_autoloader_temp_schema_location(self, path: str):
  """
  Set the location for the autoloader's streaming mode schema to be created. This is
@@ -253,6 +392,15 @@ class PreviewParameters:
  self._autoloader_temp_schema_location = path
  return self

+ def get_autoloader_temp_schema_location(self) -> str:
+ """
+ Get the location for the autoloader's streaming mode schema to be created.
+
+ Returns:
+ str: The location for the autoloader's streaming mode schema to be created.
+ """
+ return self._autoloader_temp_schema_location
+
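As an aside, a minimal sketch of how this new getter pairs with the existing `set_autoloader_temp_schema_location` setter; the DBFS path is illustrative:

```python
# Illustrative only: override the default "dbfs:/tmp/schemas" location and read it back.
ds_params = (
    PreviewParameters(spark)
    .set_autoloader_temp_schema_location("dbfs:/tmp/my_preview_schemas")
)
print(ds_params.get_autoloader_temp_schema_location())  # dbfs:/tmp/my_preview_schemas
```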
  def set_data_schema(self, schema: StructType):
  """
  Set the input schema for "input" mode. For example:
@@ -290,7 +438,7 @@ class PreviewParameters:
  self._autoloader_location = location
  return self

- def set_autoloader_format(self, file_format: str):
+ def _set_autoloader_format(self, file_format: str):
  """
  Used internally to set the autoloader format.

@@ -299,12 +447,16 @@ class PreviewParameters:
  """
  if file_format.lower() == "jsonl":
  self._autoloader_format = "json"
- self.set_autoloader_multiline(False)
+ self._cloudfiles_multiline = "false"
+ return self
+ if file_format.lower() == "wholetext":
+ self._autoloader_format = "text"
+ self._cloudfiles_wholetext = "true"
  return self
  self._autoloader_format = file_format
  return self

- def set_autoloader_schema_file(self, path: str):
+ def _set_autoloader_schema_file(self, path: str):
  """
  Set the schema file path for "autoloader" mode.

@@ -314,7 +466,7 @@ class PreviewParameters:
  self._schema_file = path
  return self

- def set_autoloader_cloudfiles_schema_hint_file(self, path: str):
+ def _set_autoloader_cloudfiles_schema_hint_file(self, path: str):
  """
  Set the cloudFiles schema hints file path for "autoloader" mode.

@@ -324,7 +476,7 @@ class PreviewParameters:
  self._clouldfiles_schema_hints_file = path
  return self

- def set_autoloader_cloudfiles_schema_hints(self, cloudfiles_schema_hints: str):
+ def _set_autoloader_cloudfiles_schema_hints(self, cloudfiles_schema_hints: str):
  """
  Set the cloudFiles schema hints string for "autoloader" mode.

@@ -334,26 +486,6 @@ class PreviewParameters:
  self._cloudfiles_schema_hints = cloudfiles_schema_hints
  return self

- def set_autoloader_reader_case_sensitive(self, b: bool):
- """
- Set the cloudFiles reader case-sensitive boolean for "autoloader" mode.
-
- Returns:
- PreviewParameters: The current instance with updated configuration.
- """
- self._cloudfiles_reader_case_sensitive = "true" if b else "false"
- return self
-
- def set_autoloader_multiline(self, b: bool):
- """
- Set the cloudFiles multiline boolean for "autoloader" mode.
-
- Returns:
- PreviewParameters: The current instance with updated configuration.
- """
- self._cloudfiles_multiline = "true" if b else "false"
- return self
-
  def set_pretransform_name(self, pretransform_name: str):
  """
  Set the pretransform name to use, if desired. If not set, Silver PreTransform
@@ -365,16 +497,6 @@ class PreviewParameters:
  self._pretransform_name = pretransform_name
  return self

- def set_bronze_pre_transform(self, expr: List[str]):
- """
- Sets a pre-transform expression that will run before data is written to bronze
-
- Returns:
- PreviewParameters: The current instance with updated configuration.
- """
- self._bronze_pre_transform = expr
- return self
-
  def set_date_range(self, column: str, start_time: str, end_time: str):
  """
  Set the TIMESTAMP column and date range to use as the input data filter to
@@ -409,3 +531,21 @@ class PreviewParameters:
  """
  self._table = table_name
  return self
+
+ def _set_load_as_single_variant(self, col_name: Optional[str] = None):
+ """
+ Enable loadAsSingleVariant mode. This will ingest data into a single VARIANT-typed column.
+ The default name of that column is `data`.
+
+ Returns:
+ PreviewParameters: The current instance with updated configuration.
+ """
+ self._single_variant_column = col_name if col_name is not None else "data"
+ return self
+
+ def add_gold_schema_table(self, gold_schema_table_name: str):
+ """
+ Add a gold schema temporary table name that will need to be cleaned
+ up at the end of the run.
+ """
+ self._gold_test_schemas.append(gold_schema_table_name)
@@ -4,6 +4,8 @@ from pyspark.sql.dataframe import DataFrame
  from pyspark.sql.functions import col, lit
  from dasl_client.preset_development.errors import *

+ import re
+
  FieldSpec = Dict[str, Any]


@@ -45,6 +47,57 @@ class Stage:

  __op_list = ["assert", "literal", "from", "alias", "expr", "join"]

+ # Characters that are not allowed in a "from" fieldSpec column reference.
+ __invalid_char_pattern = re.compile(r"[\s,;{}\(\)\n\t=]")
+
+ def __validate_field_spec(self, fields: List[Dict[str, str]], stage: str):
+ names = []
+ for field in self._fields:
+ # Check for name. If no name check for assert.
+ if not (name := field.get("name", None)):
+ if not field.get("assert", None): # Can't walrus em all :/
+ raise MissingFieldNameError(self._stage, self._name)
+
+ # Check this new name does not duplicate an existing.
+ if name in names:
+ raise DuplicateFieldNameError(self._stage, self._name, name)
+ names += [name] if name != None else []
+
+ # Check for only 1 defined operation.
+ missing_op_count = [
+ spec for spec in [field.get(op, None) for op in self.__op_list]
+ ].count(None)
+ if (missing_op_count == len(self.__op_list)) or (
+ len(self.__op_list) - missing_op_count > 1
+ ):
+ raise MalformedFieldError(
+ self._stage, self._name, field.get("name", None)
+ )
+
+ # Literal must be a string.
+ if lit := field.get("literal", None):
+ if type(lit) != str:
+ raise InvalidLiteralError(
+ self._stage, self._name, field.get("name", None)
+ )
+
+ # Validate "from" (make sure it's not an expression, etc.). This mirrors the Scala code's validation.
+ if frm := field.get("from", None):
+ if len(frm) >= 256:
+ raise InvalidFromError(
+ self._stage,
+ self._name,
+ field.get("name", None),
+ "Column name too long",
+ )
+ if frm.strip() == "" or self.__invalid_char_pattern.search(frm):
+ raise InvalidFromError(
+ self._stage,
+ self._name,
+ field.get("name", None),
+ "Malformed column name referenced",
+ )
+
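As an aside, a field list that would pass these checks might look like the following sketch; the column names and values are illustrative, not taken from the package:

```python
# Hypothetical fieldSpec entries: each one names exactly one operation from
# __op_list, "literal" values are strings, and "from" references are plain
# column names (no whitespace, commas, braces, parentheses, or "=").
fields = [
    {"name": "event_time", "from": "timestamp"},
    {"name": "vendor", "literal": "example_vendor"},
    {"name": "bytes_out", "expr": "cast(request_bytes as long)"},
    {"assert": "bytes_out >= 0"},  # assert-only entries may omit "name"
]
```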
  def __init__(self, spark: SparkSession, stage: str, table: Dict[str, any]):
  """
  Initializes a Stage object that encapsulates all operations required for a single
@@ -68,9 +121,10 @@ class Stage:
  self._utils = table.get("utils", {})
  self._input = table.get("input", None)

+ # The dasl_id does not exist before bronze or when dealing with temp fields.
  fields = (
  [{"name": "dasl_id", "from": "dasl_id"}] + table.get("fields", [])
- if self._stage != "temp_fields"
+ if self._stage not in ["temp_fields", "bronze pretransform"]
  else table.get("fields", [])
  )
  self._fields = [
@@ -78,24 +132,7 @@ class Stage:
  ]
  self._assertions = [f for f in fields if f.get("assert", None)]

- names = []
- for field in self._fields:
- if not (name := field.get("name", None)):
- if not field.get("assert", None): # Can't walrus em all :/
- raise MissingFieldNameError(self._stage, self._name)
- if name in names:
- raise DuplicateFieldNameError(self._stage, self._name, name)
- names += [name]
-
- missing_op_count = [
- spec for spec in [field.get(op, None) for op in self.__op_list]
- ].count(None)
- if (missing_op_count == len(self.__op_list)) or (
- len(self.__op_list) - missing_op_count > 1
- ):
- raise MalformedFieldError(
- self._stage, self._name, field.get("name", None)
- )
+ self.__validate_field_spec(self._fields, self._stage)

  def _referenced_columns(self) -> List[str]:
  """
@@ -311,9 +348,15 @@ class Stage:
  # check that the from column exists in the df?
  return f"{self.auto_backtick(field['from'])} AS {self.auto_backtick(name)}"
  elif field.get("literal", None):
- return f"'{field['literal']}' AS {self.auto_backtick(name)}"
+ return f"{repr(field['literal'])} AS {self.auto_backtick(name)}"
  elif field.get("expr", None) is not None:
- return f"{field['expr']} AS {self.auto_backtick(name)}"
+ expr = field["expr"].replace("\\", "\\\\")
+ # If we are in a bronze pretransform, we do not want the fieldSpec.name.
+ return (
+ f"{expr} AS {self.auto_backtick(name)}"
+ if self._stage != "bronze pretransform"
+ else expr
+ )
  else:
  return ""

@@ -426,6 +469,8 @@ class Stage:
  select_fields = self.render_fields(self._fields)

  if preserve := self._utils.get("unreferencedColumns", None):
+ if self._stage == "gold": # No utils run in gold.
+ raise DisallowedUtilityConfigurationError("unreferencedColumns", "gold")
  should_preserve = preserve.get("preserve", None)
  if type(should_preserve) != bool:
  raise MissingUtilityConfigurationFieldError(
@@ -438,9 +483,23 @@ class Stage:
  # applying backticks to all of them is OK here
  # since they will never use "obj.key" to reference nested fields of structs
  # so we just go ahead and apply backticks to all across the board
- select_fields += [
- f"struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))}) AS {self.auto_backtick(embed_col)}"
- ]
+ colType = preserve.get("embedColumnType", "struct")
+ if colType == "struct":
+ select_fields += [
+ f"struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))}) AS {self.auto_backtick(embed_col)}"
+ ]
+ elif colType == "json":
+ select_fields += [
+ f"to_json(struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))})) AS {self.auto_backtick(embed_col)}"
+ ]
+ elif colType == "variant":
+ select_fields += [
+ f"parse_json(to_json(struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))}))) AS {self.auto_backtick(embed_col)}"
+ ]
+ else:
+ raise UnknownUtilityConfigurationFieldError(
+ "embedColumnType", "unreferencedColumns"
+ )
  else:
  (
  preserved_columns,
@@ -476,6 +535,8 @@ class Stage:
  A DataFrame with the resultant operation's records.
  """
  if json_extracts := self._utils.get("jsonExtract", None):
+ if self._stage == "gold": # No utils run in gold.
+ raise DisallowedUtilityConfigurationError("jsonExtract", "gold")
  for json_extract in json_extracts:
  source = json_extract.get("source")
  if not source:
@@ -610,6 +671,8 @@ class Stage:
  A DataFrame with the resultant operation's records.
  """
  if temp_fields := self._utils.get("temporaryFields", None):
+ if self._stage == "gold": # No utils run in gold.
+ raise DisallowedUtilityConfigurationError("temporaryFields", "gold")
  df = Stage(self._spark, "temp_fields", {"fields": temp_fields}).run(df)
  return df

dasl_client/regions.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "us-east-1": "https://api.prod.sl.antimatter.io"
+ }
dasl_client/regions.py ADDED
@@ -0,0 +1,18 @@
+ import json
+ from importlib import resources
+ from typing import List
+
+ _data = json.loads(resources.files(__package__).joinpath("regions.json").read_text())
+
+
+ class Regions:
+ @staticmethod
+ def lookup(name: str) -> str:
+ try:
+ return _data[name]
+ except KeyError as e:
+ raise ValueError(f"unknown region {name}") from e
+
+ @staticmethod
+ def list() -> List[str]:
+ return list(_data.keys())
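A brief usage sketch of the new Regions helper; the output comments reflect the single entry bundled in regions.json, and the unknown-region name is illustrative:

```python
from dasl_client.regions import Regions

# Enumerate the bundled regions, then resolve one to its API endpoint.
print(Regions.list())               # ['us-east-1']
print(Regions.lookup("us-east-1"))  # https://api.prod.sl.antimatter.io

# Unknown region names raise ValueError.
try:
    Regions.lookup("mars-north-1")
except ValueError as err:
    print(err)                      # unknown region mars-north-1
```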