dasl-client 1.0.22__py3-none-any.whl → 1.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dasl-client might be problematic.
- dasl_client/__init__.py +1 -0
- dasl_client/client.py +240 -23
- dasl_client/exec_rule.py +92 -0
- dasl_client/helpers.py +1 -1
- dasl_client/preset_development/errors.py +42 -0
- dasl_client/preset_development/preview_engine.py +122 -61
- dasl_client/preset_development/preview_parameters.py +237 -97
- dasl_client/preset_development/stage.py +87 -24
- dasl_client/regions.json +3 -0
- dasl_client/regions.py +18 -0
- dasl_client/types/datasource.py +51 -0
- dasl_client/types/rule.py +33 -0
- dasl_client/types/workspace_config.py +121 -9
- dasl_client-1.0.24.dist-info/METADATA +18 -0
- dasl_client-1.0.24.dist-info/RECORD +32 -0
- {dasl_client-1.0.22.dist-info → dasl_client-1.0.24.dist-info}/WHEEL +1 -1
- {dasl_client-1.0.22.dist-info → dasl_client-1.0.24.dist-info}/top_level.txt +0 -1
- dasl_client-1.0.22.dist-info/METADATA +0 -34
- dasl_client-1.0.22.dist-info/RECORD +0 -36
- test/__init__.py +0 -0
- test/conftest.py +0 -18
- test/constants.py +0 -10
- test/test_api_changes.py +0 -137
- test/test_api_surface.py +0 -304
- test/test_databricks_secret_auth.py +0 -116
- test/test_marshaling.py +0 -910
- {dasl_client-1.0.22.dist-info → dasl_client-1.0.24.dist-info/licenses}/LICENSE +0 -0
dasl_client/preset_development/preview_parameters.py

@@ -9,11 +9,6 @@ import uuid
 from IPython import get_ipython


-@udf(StringType())
-def constant_udf(*args):
-    return "<sortable_random_id>"
-
-
 class PreviewParameters:
     """
     This class provides three methods for supplying input records to the preset development environment.
@@ -60,6 +55,78 @@ class PreviewParameters:
         .set_table("system.access.audit")
     ```

+    **4. SilverBronze Mode:**
+    "silverbronze" mode, works like a more advanced "table" mode. It allows for joining of multiple
+    tables as input. This mode requires setting bronze table definitions. This mode behaves in 2
+    seperate ways depending on whether an autoloader location is set or not. If an autoloader location
+    is set the first entry in the bronze table definitions is used to name and alias the autoloader's
+    input and these can be used in later join expressions. Used in this way, the autoloader will be
+    loaded as in "autoloader" mode, and run through preBronze stages before being joined with the
+    remainder of the bronze table definitions. This mimics not skipping bronze in a DataSource and
+    joining what was read in silver. If an autoloader location is not set, the behaviour instead
+    attempts to emulate a DataSource set to skip the bronze stage. That is, all preBronze and bronze
+    stages will be skipped, and the name of the first entry in the given bronze table definitions will
+    be read from instead. Any subsequent bronze table definitions will be joined against this table.
+
+    Using no autoloader location (this will read from the first table):
+    ```python
+    bronze_tables = [
+        {
+            "name": "databricks_dev.default.sev_map",
+            "alias": "tab1"
+        },
+        {
+            "name": "databricks_dev.alan_bronze.akamai_waf",
+            "alias": "tab2",
+            "joinExpr": "id::string = tab2.serviceID",
+            "joinType": "left"
+        },
+        {
+            "name": "databricks_dev.alan_silver.cloudflare_hjttp_request",
+            "alias": "tab3",
+            "joinExpr": "tab1.id::string = tab3.ClientRequestsBytes",
+            "joinType": "inner"
+        }
+    ]
+
+    ds_params = (
+        PreviewParameters(spark)
+        .from_silverbronze_tables()
+        .set_bronze_table_definitions(bronze_tables)
+    )
+
+    ps = PreviewEngine(spark, yaml_string, ds_params)
+    ps.evaluate("stage.gold")
+    ```
+
+    Using an autoloader location (this will read from the autoloader and name the df tab1):
+    ```python
+    bronze_tables = [
+        {
+            "name": "tab1"
+        },
+        {
+            "name": "databricks_dev.alan_bronze.akamai_waf",
+            "alias": "tab2",
+            "joinExpr": "id::string = tab2.serviceID",
+            "joinType": "left"
+        },
+        {
+            "name": "databricks_dev.alan_silver.cloudflare_hjttp_request",
+            "alias": "tab3",
+            "joinExpr": "tab1.id::string = tab3.ClientRequestsBytes",
+            "joinType": "inner"
+        }
+    ]
+
+    ds_params = (
+        PreviewParameters(spark)
+        .from_silverbronze_tables()
+        .set_bronze_table_definitions(bronze_tables)
+        .set_autoloader_location("s3://antimatter-dasl-testing/csamples3/mars/area1/")
+    )
+    ```
+
     **Note:**
     When using autoloader mode, this implementation requires a location to store a temporary schema for
     the loaded records. By default, this is set to `"dbfs:/tmp/schemas"`. You can change this using
@@ -94,9 +161,10 @@ class PreviewParameters:
             df (DataFrame): Internal Spark DataFrame loaded using the specified parameters.
         """
         self._spark = spark
-        self._mode = None  # [input, autoloader]
+        self._mode = None  # [input, table, autoloader, silverbronze]
         self._record_limit = 10
         self._autoloader_temp_schema_location = "dbfs:/tmp/schemas"
+        self._gold_test_schemas = []

         self._time_column = None
         self._start_time = None
@@ -109,18 +177,97 @@ class PreviewParameters:
         self._cloudfiles_schema_hints = None
         self._cloudfiles_reader_case_sensitive = "true"
         self._cloudfiles_multiline = "true"
+        self._cloudfiles_wholetext = "false"
         self._schema_uuid_str = str(uuid.uuid4())
+        self._single_variant_column = None

         self._schema = None
         self._data = None

         self._table = None

+        self._bronze_tables = None
+
         self._pretransform_name = None
-        self._bronze_pre_transform: Optional[List[str]] = None

         self._df = None

+    def __create_from_autoloader(self) -> DataFrame:
+        stream_df = (
+            self._spark.readStream.format("cloudFiles")
+            .option("cloudFiles.format", self._autoloader_format)
+            .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
+        )
+
+        # text and wholetext needs to be handled seperately.
+        stream_df = (
+            stream_df.option("multiline", self._cloudfiles_multiline)
+            if self._autoloader_format != "text"
+            else stream_df.option("wholetext", self._cloudfiles_wholetext)
+        )
+
+        if self._single_variant_column:
+            stream_df = stream_df.option(
+                "singleVariantColumn", self._single_variant_column
+            )
+
+        if self._schema_file:
+            with open(self._schema_file, "r") as f:
+                stream_df = stream_df.schema(f.read().strip())
+        else:
+            stream_df = (
+                stream_df.option("inferSchema", "true")
+                .option("cloudFiles.inferColumnTypes", "true")
+                .option(
+                    "cloudFiles.schemaLocation",
+                    f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+                )
+            )
+
+        if self._cloudfiles_schema_hints:
+            stream_df = stream_df.option(
+                "cloudFiles.schemaHints", self._cloudfiles_schema_hints
+            )
+        elif self._clouldfiles_schema_hints_file:
+            stream_df = stream_df.option(
+                "cloudFiles.schemaHintsFile", self._clouldfiles_schema_hints_file
+            )
+
+        stream_df = stream_df.load(self._autoloader_location).limit(self._record_limit)
+
+        query = (
+            stream_df.writeStream.format("memory")
+            .queryName("batch_data")
+            .trigger(availableNow=True)
+            .start()
+        )
+
+        query.awaitTermination()
+
+    def __create_from_silverbronze_tables_join(self) -> DataFrame:
+        if not self._bronze_tables or not len(self._bronze_tables):
+            raise MissingBronzeTablesError()
+
+        # Validate name and joinExpr are set.
+        for i in range(len(self._bronze_tables)):
+            if not self._bronze_tables[i].get("name", None):
+                raise MissingBronzeTableFieldError("name")
+            if i > 0 and not self._bronze_tables[i].get("joinExpr", None):
+                raise MissingBronzeTableFieldError("joinExpr")
+
+        # If there is an autoloader location given, we create the df now and
+        # then allow preBronze stage to run. Otherwise we skip preBronze stages
+        # and as part of running the silverbronze joins we create the df from
+        # the first entry in the bronze tables list.
+        df = None
+        if self._autoloader_location:
+            self.__create_from_autoloader()
+            df = self._spark.table("batch_data").alias(
+                self._bronze_tables[0].get("name", "")
+            )  # Use first's name.
+
+        return df
+
     def __enter__(self):
         """
         Creates a DataFrame with data using the method specified. In the case of "autoloader",
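For context on the new `__create_from_autoloader` helper above: it drains an Auto Loader stream into an in-memory table and reads it back as a batch DataFrame. Below is a minimal, hedged sketch of that pattern, assuming a Databricks SparkSession named `spark`; the S3 path, schema location, and option values are illustrative, not the package's defaults.

```python
# Sketch of the stream -> memory sink -> batch table pattern used in the diff above.
# `spark` is assumed to be an existing SparkSession on Databricks with Auto Loader available.
stream_df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.inferColumnTypes", "true")
    .option("cloudFiles.schemaLocation", "dbfs:/tmp/schemas/example")
    .load("s3://example-bucket/raw/")
    .limit(10)  # mirrors the record limit applied in the package code
)

# Drain everything currently available into an in-memory table named "batch_data"...
(
    stream_df.writeStream.format("memory")
    .queryName("batch_data")
    .trigger(availableNow=True)
    .start()
    .awaitTermination()
)

# ...then read it back as an ordinary batch DataFrame, as __enter__ does.
preview_df = spark.table("batch_data")
```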
@@ -136,59 +283,10 @@ class PreviewParameters:
         elif self._mode == "table":
             self._df = self._spark.table(self._table).limit(self._record_limit)
         elif self._mode == "autoloader":
-
-            self._spark.readStream.format("cloudFiles")
-            .option("cloudFiles.format", self._autoloader_format)
-            .option("multiline", self._cloudfiles_multiline)
-            .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
-            )
-
-            if self._schema_file:
-                with open(self._schema_file, "r") as f:
-                    stream_df = stream_df.schema(f.read().strip())
-            else:
-                stream_df = (
-                    stream_df.option("inferSchema", "true")
-                    .option("cloudFiles.inferColumnTypes", "true")
-                    .option(
-                        "cloudFiles.schemaLocation",
-                        f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
-                    )
-                )
-
-            if self._cloudfiles_schema_hints:
-                stream_df = stream_df.option(
-                    "cloudFiles.schemaHints", self._cloudfiles_schema_hints
-                )
-            elif self._clouldfiles_schema_hints_file:
-                stream_df = stream_df.option(
-                    "cloudFiles.schemaHintsFile", self._clouldfiles_schema_hints_file
-                )
-
-            stream_df = stream_df.load(self._autoloader_location).limit(
-                self._record_limit
-            )
-
-            if self._bronze_pre_transform is not None:
-                stream_df = stream_df.selectExpr(*self._bronze_pre_transform)
-
-            query = (
-                stream_df.writeStream.format("memory")
-                .queryName("batch_data")
-                .trigger(availableNow=True)
-                .start()
-            )
-
-            query.awaitTermination()
-
+            self.__create_from_autoloader()
             self._df = self._spark.table("batch_data")
-
-
-            self._df = self._df.filter(
-                f"timestamp({self._time_column}) >= timestamp('{self._start_time}') AND timestamp({self._time_column}) < timestamp('{self._end_time}')"
-            )
-
-            self._df = self._df.withColumn("dasl_id", constant_udf())
+        elif self._mode == "silverbronze":
+            self._df = self.__create_from_silverbronze_tables_join()

         return self._df

@@ -206,10 +304,21 @@ class PreviewParameters:
                 f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
                 recurse=True,
             )
+            for gold_test_schema in self._gold_test_schemas:
+                dbutils.fs.rm(
+                    f"{self._autoloader_temp_schema_location}/{gold_test_schema}",
+                    recurse=True,
+                )
         else:
-
-            f"FYI, we are leaking temp data {self._autoloader_temp_schema_location}/{self._schema_uuid_str}"
-
+            leaked_lines = [
+                f"FYI, we are leaking temp data {self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+                *[
+                    f"{self._autoloader_temp_schema_location}/{x}"
+                    for x in self._gold_test_schemas
+                ],
+            ]
+            print(", ".join(leaked_lines))
+        self._gold_test_schemas = []

     def from_input(self):
         """
@@ -242,6 +351,36 @@ class PreviewParameters:
         self._mode = "table"
         return self

+    def from_silverbronze_tables(self):
+        """
+        Set the data source loader to "bronze tables" mode. Requires a list of bronze table
+        definitions to be provided.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._mode = "silverbronze"
+        return self
+
+    def set_bronze_table_definitions(self, definitions: List[Dict[str, str]]):
+        """
+        Set the bronze table definitions for bronze tables mode. `name` and `joinExpr` are
+        required. If `alias` is not provided, one can use the `name` to refer to the table.
+        If `joinType` is not provided, "left" is used as a default value. If pr
+
+        [
+            {
+                "name": "name",
+                "alias": "alias1",
+                "joinType": "inner",
+                "joinExpr": "base_table.col1 = alias1.col1
+            },
+            ...
+        ]
+        """
+        self._bronze_tables = definitions
+        return self
+
     def set_autoloader_temp_schema_location(self, path: str):
         """
         Set the location for the autoloader's streaming mode schema to be created. This is
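To restate the (truncated) docstring above in runnable form, here is a hedged sketch of a bronze table definition list using only the keys shown in this diff (`name`, `alias`, `joinExpr`, `joinType`); the catalog, schema, and column names are hypothetical and `spark` is assumed to be an existing SparkSession.

```python
# Hypothetical bronze table definitions; only the first entry may omit joinExpr.
bronze_tables = [
    {"name": "my_catalog.my_schema.base_events", "alias": "base"},
    {
        "name": "my_catalog.my_schema.service_lookup",
        "alias": "svc",
        "joinExpr": "base.service_id = svc.id",
        "joinType": "left",  # per the docstring, "left" is also the default
    },
]

params = (
    PreviewParameters(spark)
    .from_silverbronze_tables()
    .set_bronze_table_definitions(bronze_tables)
)
```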
@@ -253,6 +392,15 @@ class PreviewParameters:
         self._autoloader_temp_schema_location = path
         return self

+    def get_autoloader_temp_schema_location(self) -> str:
+        """
+        Get the location for the autoloader's streaming mode schema to be created.
+
+        Returns:
+            str: The location for the autoloader's streaming mode schema to be created.
+        """
+        return self._autoloader_temp_schema_location
+
     def set_data_schema(self, schema: StructType):
         """
         Set the input schema for "input" mode. For example:
@@ -290,7 +438,7 @@ class PreviewParameters:
         self._autoloader_location = location
         return self

-    def
+    def _set_autoloader_format(self, file_format: str):
         """
         Used internally to set the autoloader format.

@@ -299,12 +447,16 @@ class PreviewParameters:
         """
         if file_format.lower() == "jsonl":
             self._autoloader_format = "json"
-            self.
+            self._cloudfiles_multiline = "false"
+            return self
+        if file_format.lower() == "wholetext":
+            self._autoloader_format = "text"
+            self._cloudfiles_wholetext = "true"
             return self
         self._autoloader_format = file_format
         return self

-    def
+    def _set_autoloader_schema_file(self, path: str):
         """
         Set the schema file path for "autoloader" mode.

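A small standalone sketch of the convenience mapping `_set_autoloader_format` now applies: "jsonl" and "wholetext" are aliases that resolve to Auto Loader's "json" and "text" formats with the matching reader flag, while any other value is passed through unchanged. The function below is ours, written only to illustrate the mapping.

```python
# Standalone illustration of the format aliasing shown in the hunk above.
def normalise_autoloader_format(file_format: str):
    if file_format.lower() == "jsonl":
        return "json", {"multiline": "false"}
    if file_format.lower() == "wholetext":
        return "text", {"wholetext": "true"}
    return file_format, {}

print(normalise_autoloader_format("jsonl"))      # ('json', {'multiline': 'false'})
print(normalise_autoloader_format("wholetext"))  # ('text', {'wholetext': 'true'})
print(normalise_autoloader_format("csv"))        # ('csv', {})
```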
@@ -314,7 +466,7 @@ class PreviewParameters:
         self._schema_file = path
         return self

-    def
+    def _set_autoloader_cloudfiles_schema_hint_file(self, path: str):
         """
         Set the cloudFiles schema hints file path for "autoloader" mode.

@@ -324,7 +476,7 @@ class PreviewParameters:
         self._clouldfiles_schema_hints_file = path
         return self

-    def
+    def _set_autoloader_cloudfiles_schema_hints(self, cloudfiles_schema_hints: str):
         """
         Set the cloudFiles schema hints string for "autoloader" mode.

@@ -334,26 +486,6 @@ class PreviewParameters:
         self._cloudfiles_schema_hints = cloudfiles_schema_hints
         return self

-    def set_autoloader_reader_case_sensitive(self, b: bool):
-        """
-        Set the cloudFiles reader case-sensitive boolean for "autoloader" mode.
-
-        Returns:
-            PreviewParameters: The current instance with updated configuration.
-        """
-        self._cloudfiles_reader_case_sensitive = "true" if b else "false"
-        return self
-
-    def set_autoloader_multiline(self, b: bool):
-        """
-        Set the cloudFiles multiline boolean for "autoloader" mode.
-
-        Returns:
-            PreviewParameters: The current instance with updated configuration.
-        """
-        self._cloudfiles_multiline = "true" if b else "false"
-        return self
-
     def set_pretransform_name(self, pretransform_name: str):
         """
         Set the pretransform name to use, if desired. If not set, Silver PreTransform
@@ -365,16 +497,6 @@ class PreviewParameters:
         self._pretransform_name = pretransform_name
         return self

-    def set_bronze_pre_transform(self, expr: List[str]):
-        """
-        Sets a pre-transform expression that will run before data is written to bronze
-
-        Returns:
-            PreviewParameters: The current instance with updated configuration.
-        """
-        self._bronze_pre_transform = expr
-        return self
-
     def set_date_range(self, column: str, start_time: str, end_time: str):
         """
         Set the TIMESTAMP column and date range to use as the input data filter to
@@ -409,3 +531,21 @@ class PreviewParameters:
         """
         self._table = table_name
         return self
+
+    def _set_load_as_single_variant(self, col_name: Optional[str] = None):
+        """
+        Enable loadAsSingleVariant mode. This will ingest data into a single VARIANT-typed column.
+        The default name of that column is `data`.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._single_variant_column = col_name if col_name is not None else "data"
+        return self
+
+    def add_gold_schema_table(self, gold_schema_table_name: str):
+        """
+        Add a gold schema temporary table name that will need to be cleaned
+        up at the end of the run.
+        """
+        self._gold_test_schemas.append(gold_schema_table_name)
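A hedged usage sketch of the two helpers added at the end of the class (the first is underscore-prefixed, so presumably intended for internal use); `spark` is assumed to be an existing SparkSession and the table and column names are hypothetical.

```python
params = PreviewParameters(spark)

# Ingest into one VARIANT column; without an argument the column is named "data".
params._set_load_as_single_variant("raw_event")

# Register a gold-test schema table so the context manager's __exit__ removes it
# from the autoloader temp schema location (or reports it as leaked).
params.add_gold_schema_table("gold_test_schema_1")
```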
dasl_client/preset_development/stage.py

@@ -4,6 +4,8 @@ from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.functions import col, lit
 from dasl_client.preset_development.errors import *

+import re
+
 FieldSpec = Dict[str, Any]


@@ -45,6 +47,57 @@ class Stage:

     __op_list = ["assert", "literal", "from", "alias", "expr", "join"]

+    # From fieldSpec valid characters.
+    __invalid_char_pattern = re.compile(r"[\s,;{}\(\)\n\t=]")
+
+    def __validate_field_spec(self, fields: List[Dict[str, str]], stage: str):
+        names = []
+        for field in self._fields:
+            # Check for name. If no name check for assert.
+            if not (name := field.get("name", None)):
+                if not field.get("assert", None):  # Can't walrus em all :/
+                    raise MissingFieldNameError(self._stage, self._name)
+
+            # Check this new name does not duplicate an existing.
+            if name in names:
+                raise DuplicateFieldNameError(self._stage, self._name, name)
+            names += [name] if name != None else []
+
+            # Check for only 1 defined operation.
+            missing_op_count = [
+                spec for spec in [field.get(op, None) for op in self.__op_list]
+            ].count(None)
+            if (missing_op_count == len(self.__op_list)) or (
+                len(self.__op_list) - missing_op_count > 1
+            ):
+                raise MalformedFieldError(
+                    self._stage, self._name, field.get("name", None)
+                )
+
+            # Literal must be a string.
+            if lit := field.get("literal", None):
+                if type(lit) != str:
+                    raise InvalidLiteralError(
+                        self._stage, self._name, field.get("name", None)
+                    )
+
+            # Validate from (makes sure its not an expression, etc.). This mirrors Scala code's validation.
+            if frm := field.get("from", None):
+                if len(frm) >= 256:
+                    raise InvalidFromError(
+                        self._stage,
+                        self._name,
+                        field.get("name", None),
+                        "Column name too long",
+                    )
+                if frm.strip() == "" or self.__invalid_char_pattern.search(frm):
+                    raise InvalidFromError(
+                        self._stage,
+                        self._name,
+                        field.get("name", None),
+                        "Malformed column name referenced",
+                    )
+
     def __init__(self, spark: SparkSession, stage: str, table: Dict[str, any]):
         """
         Initializes a Stage object that encapsulates all operations required for a single
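To make the new checks concrete, here is a hedged set of example fieldSpec entries and the error each would trigger under `__validate_field_spec`; the field and column names are hypothetical.

```python
ok_field        = {"name": "user", "from": "userIdentity.email"}  # exactly one operation: "from"
no_operation    = {"name": "user"}                                 # no op at all -> MalformedFieldError
two_operations  = {"name": "user", "from": "a", "literal": "b"}    # two ops -> MalformedFieldError
non_str_literal = {"name": "count", "literal": 3}                  # literal must be a str -> InvalidLiteralError
bad_from        = {"name": "user", "from": "a, b"}                 # comma matches the invalid-char pattern -> InvalidFromError
unnamed         = {"from": "a"}                                    # no name and no assert -> MissingFieldNameError
```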
@@ -68,9 +121,10 @@ class Stage:
         self._utils = table.get("utils", {})
         self._input = table.get("input", None)

+        # The dasl_id does not exist before bronze or when dealing with temp fields.
         fields = (
             [{"name": "dasl_id", "from": "dasl_id"}] + table.get("fields", [])
-            if self._stage
+            if self._stage not in ["temp_fields", "bronze pretransform"]
             else table.get("fields", [])
         )
         self._fields = [
@@ -78,24 +132,7 @@ class Stage:
         ]
         self._assertions = [f for f in fields if f.get("assert", None)]

-
-        for field in self._fields:
-            if not (name := field.get("name", None)):
-                if not field.get("assert", None):  # Can't walrus em all :/
-                    raise MissingFieldNameError(self._stage, self._name)
-            if name in names:
-                raise DuplicateFieldNameError(self._stage, self._name, name)
-            names += [name]
-
-            missing_op_count = [
-                spec for spec in [field.get(op, None) for op in self.__op_list]
-            ].count(None)
-            if (missing_op_count == len(self.__op_list)) or (
-                len(self.__op_list) - missing_op_count > 1
-            ):
-                raise MalformedFieldError(
-                    self._stage, self._name, field.get("name", None)
-                )
+        self.__validate_field_spec(self._fields, self._stage)

     def _referenced_columns(self) -> List[str]:
         """
@@ -311,9 +348,15 @@ class Stage:
             # check that the from column exists in the df?
             return f"{self.auto_backtick(field['from'])} AS {self.auto_backtick(name)}"
         elif field.get("literal", None):
-            return f"
+            return f"{repr(field['literal'])} AS {self.auto_backtick(name)}"
         elif field.get("expr", None) is not None:
-
+            expr = field["expr"].replace("\\", "\\\\")
+            # If we are in a bronze pretransform, we do not want the fieldSpec.name.
+            return (
+                f"{expr} AS {self.auto_backtick(name)}"
+                if self._stage != "bronze pretransform"
+                else expr
+            )
         else:
             return ""

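The literal branch now renders the value through `repr()`, so a Python string becomes a quoted SQL constant. A tiny hedged illustration with a hypothetical field:

```python
field = {"name": "vendor", "literal": "akamai"}
print(f"{repr(field['literal'])} AS `{field['name']}`")  # 'akamai' AS `vendor`
```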
@@ -426,6 +469,8 @@ class Stage:
         select_fields = self.render_fields(self._fields)

         if preserve := self._utils.get("unreferencedColumns", None):
+            if self._stage == "gold":  # No utils run in gold.
+                raise DisallowedUtilityConfigurationError("unreferencedColumns", "gold")
             should_preserve = preserve.get("preserve", None)
             if type(should_preserve) != bool:
                 raise MissingUtilityConfigurationFieldError(
@@ -438,9 +483,23 @@ class Stage:
             # applying backticks to all of them is OK here
             # since they will never use "obj.key" to reference nested fields of structs
             # so we just go ahead and apply backticks to all across the board
-
-
-
+            colType = preserve.get("embedColumnType", "struct")
+            if colType == "struct":
+                select_fields += [
+                    f"struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))}) AS {self.auto_backtick(embed_col)}"
+                ]
+            elif colType == "json":
+                select_fields += [
+                    f"to_json(struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))})) AS {self.auto_backtick(embed_col)}"
+                ]
+            elif colType == "variant":
+                select_fields += [
+                    f"parse_json(to_json(struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))}))) AS {self.auto_backtick(embed_col)}"
+                ]
+            else:
+                raise UnknownUtilityConfigurationFieldError(
+                    "embedColumnType", "unreferencedColumns"
+                )
         else:
             (
                 preserved_columns,
|
|
|
476
535
|
A DataFrame with the resultant operation's records.
|
|
477
536
|
"""
|
|
478
537
|
if json_extracts := self._utils.get("jsonExtract", None):
|
|
538
|
+
if self._stage == "gold": # No utils run in gold.
|
|
539
|
+
raise DisallowedUtilityConfigurationError("jsonExtract", "gold")
|
|
479
540
|
for json_extract in json_extracts:
|
|
480
541
|
source = json_extract.get("source")
|
|
481
542
|
if not source:
|
|
@@ -610,6 +671,8 @@ class Stage:
             A DataFrame with the resultant operation's records.
         """
         if temp_fields := self._utils.get("temporaryFields", None):
+            if self._stage == "gold":  # No utils run in gold.
+                raise DisallowedUtilityConfigurationError("temporaryFields", "gold")
             df = Stage(self._spark, "temp_fields", {"fields": temp_fields}).run(df)
         return df

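All three utility blocks (`unreferencedColumns`, `jsonExtract`, `temporaryFields`) are now rejected when the stage is "gold". Below is a hedged sketch of a table definition that would trip the new check; the field and table shapes are illustrative only, and `spark` is assumed to be an existing SparkSession.

```python
gold_table = {
    "fields": [{"name": "user", "from": "user"}],
    "utils": {"temporaryFields": [{"name": "tmp", "expr": "upper(user)"}]},
}

stage = Stage(spark, "gold", gold_table)
# Running this stage would now raise DisallowedUtilityConfigurationError("temporaryFields", "gold").
```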
dasl_client/regions.json
ADDED

dasl_client/regions.py
ADDED

@@ -0,0 +1,18 @@
+import json
+from importlib import resources
+from typing import List
+
+_data = json.loads(resources.files(__package__).joinpath("regions.json").read_text())
+
+
+class Regions:
+    @staticmethod
+    def lookup(name: str) -> str:
+        try:
+            return _data(name)
+        except KeyError as e:
+            raise ValueError(f"unknown region {name}") from e
+
+    @staticmethod
+    def list() -> List[str]:
+        return list(_data.keys())
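A short hedged sketch of how the new `Regions` helper might be used; it assumes the bundled regions.json maps region names to their configured values.

```python
from dasl_client.regions import Regions

print(Regions.list())                     # all region names from the bundled regions.json
print(Regions.lookup(Regions.list()[0]))  # resolve one region name; unknown names raise ValueError
```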