dasl-client 1.0.23__py3-none-any.whl → 1.0.24__py3-none-any.whl
- dasl_client/__init__.py +1 -0
- dasl_client/client.py +240 -23
- dasl_client/exec_rule.py +92 -0
- dasl_client/helpers.py +1 -1
- dasl_client/preset_development/errors.py +42 -0
- dasl_client/preset_development/preview_engine.py +106 -25
- dasl_client/preset_development/preview_parameters.py +206 -94
- dasl_client/preset_development/stage.py +87 -24
- dasl_client/regions.json +3 -0
- dasl_client/regions.py +18 -0
- dasl_client/types/datasource.py +51 -0
- dasl_client/types/rule.py +33 -0
- dasl_client/types/workspace_config.py +13 -1
- dasl_client-1.0.24.dist-info/METADATA +18 -0
- dasl_client-1.0.24.dist-info/RECORD +32 -0
- {dasl_client-1.0.23.dist-info → dasl_client-1.0.24.dist-info}/WHEEL +1 -1
- {dasl_client-1.0.23.dist-info → dasl_client-1.0.24.dist-info}/top_level.txt +0 -1
- dasl_client-1.0.23.dist-info/METADATA +0 -34
- dasl_client-1.0.23.dist-info/RECORD +0 -36
- test/__init__.py +0 -0
- test/conftest.py +0 -18
- test/constants.py +0 -10
- test/test_api_changes.py +0 -137
- test/test_api_surface.py +0 -306
- test/test_databricks_secret_auth.py +0 -119
- test/test_marshaling.py +0 -921
- {dasl_client-1.0.23.dist-info → dasl_client-1.0.24.dist-info/licenses}/LICENSE +0 -0
dasl_client/preset_development/preview_engine.py

@@ -7,6 +7,12 @@ from dasl_client.preset_development.stage import *
 from dasl_client.preset_development.errors import *
 import yaml
 from IPython import get_ipython
+from itertools import count
+
+
+@udf(StringType())
+def constant_udf(*args):
+    return "<sortable_random_id>"
 
 
 class PreviewEngine:
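The `constant_udf` helper added here (and removed from preview_parameters.py further down) stamps every row with the same placeholder string in place of the sortable random id generated in production. A minimal sketch of the pattern, runnable on its own; everything beyond the decorator, the function body, and the `dasl_id` column name is assumed:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark = SparkSession.builder.getOrCreate()

@udf(StringType())
def constant_udf(*args):
    # Every row receives the same placeholder instead of a real id.
    return "<sortable_random_id>"

# Calling the UDF with no arguments still yields a Column expression.
df = spark.range(3).withColumn("dasl_id", constant_udf())
df.show()  # all rows show "<sortable_random_id>" in dasl_id
```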
@@ -54,8 +60,9 @@
             self._preset.get("silver", None), self._pretransform_name
         )
 
+        self._pre_bronze = None
         self._bronze = None
-        self.
+        self._pre_silver = None
         self._silver = []
         self._gold = []
         self._result_df_map: Tuple[
@@ -124,13 +131,32 @@
 
     def _compile_stages(self) -> None:
         """
-        Creates Stage objects, setting pretransform to None if not provided.
+        Creates Stage objects, setting silver pretransform to None if not provided.
         """
+        pre_bronze_field_counter = count()
+        pre_bronze_name_counter = count()
+        pre_bronze_expr_groups = self._preset.get("bronze", {}).get("preTransform", [])
+        if pre_bronze_expr_groups:
+            tables = [
+                {
+                    "name": f"Index {next(pre_bronze_name_counter)}",
+                    "fields": [
+                        {"name": str(next(pre_bronze_field_counter)), "expr": expr}
+                        for expr in expr_group
+                    ],
+                }
+                for expr_group in pre_bronze_expr_groups
+            ]
+            for table in tables:
+                self._pre_bronze = [
+                    Stage(self._spark, "bronze pretransform", table) for table in tables
+                ]
+
         pretransform = None
         if self._pretransform_name:
             for table in self._preset["silver"]["preTransform"]:
                 if table["name"] == self._pretransform_name:
-                    self.
+                    self._pre_silver = Stage(self._spark, "silver pretransform", table)
                     break
 
         self._silver = [
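Within `_compile_stages`, each group of bronze preTransform expressions is rewritten into a synthetic table definition (a generated name plus numbered fields) so it can be wrapped in a `Stage`. A sketch of just that reshaping, with a made-up pair of expression groups; note the field counter is created once, so numbering continues across groups:

```python
from itertools import count

# Hypothetical input: two groups of SQL expressions from bronze.preTransform.
pre_bronze_expr_groups = [["col_a", "upper(col_b)"], ["col_c"]]

pre_bronze_field_counter = count()
pre_bronze_name_counter = count()
tables = [
    {
        "name": f"Index {next(pre_bronze_name_counter)}",
        "fields": [
            {"name": str(next(pre_bronze_field_counter)), "expr": expr}
            for expr in expr_group
        ],
    }
    for expr_group in pre_bronze_expr_groups
]
# tables == [
#     {"name": "Index 0", "fields": [{"name": "0", "expr": "col_a"},
#                                    {"name": "1", "expr": "upper(col_b)"}]},
#     {"name": "Index 1", "fields": [{"name": "2", "expr": "col_c"}]},
# ]
```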
@@ -151,8 +177,56 @@
         Returns:
             Dataframes containing the output from each run Stage.
         """
-
-
+        # If we are in silverbronze mode, and an autoloader has been provided, or we are
+        # not in silverbronze mode, we need to run the preBronze stage.
+        pre_bronze_output = {}
+        if (
+            self._ds_params._mode != "silverbronze"
+            or self._ds_params._autoloader_location
+        ):
+            if self._pre_bronze:
+                for stage in self._pre_bronze:
+                    df = stage.run(df)
+                    pre_bronze_output[stage._name] = df
+        else:
+            # We are in silverbronze mode with no autoloader, so we treat the first
+            # silverbronze table as the initial df.
+            df = (
+                self._spark.table(self._ds_params._bronze_tables[0].get("name", ""))
+                .drop("dasl_id")
+                .limit(self._ds_params._record_limit)
+            )
+
+            if time_col := self._ds_params._time_column:
+                df = df.filter(
+                    f"timestamp({time_col}) >= timestamp('{self._ds_params._start_time}') AND timestamp({time_col}) < timestamp('{self._ds_params._end_time}')"
+                )
+
+            df = df.withColumn("dasl_id", constant_udf())
+
+        self._bronze = df
+
+        # Deal with silverbronze table joins.
+        # Note: We can blindly get here as validation should've caught anything missing.
+        if self._ds_params._mode == "silverbronze":
+            if alias := self._ds_params._bronze_tables[0].get("alias", None):
+                df = df.alias(alias)
+            for bronze_table in self._ds_params._bronze_tables[1:]:
+                join_df = (
+                    spark.table(bronze_table["name"])
+                    .drop("dasl_id")
+                    .limit(self._ds_params._record_limit)
+                )
+                if alias := bronze_table.get("alias", None):
+                    join_df = join_df.alias(alias)
+                df = df.join(
+                    join_df,
+                    expr(bronze_table["joinExpr"]),
+                    bronze_table.get("joinType", "left"),
+                )
+
+        if self._pre_silver:
+            df = self._pre_silver.run(df)
 
         silver_output_map = {}
         for table in self._silver:
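The silverbronze branch of `_run` folds the remaining bronze table definitions into a single DataFrame through successive joins, each driven by the definition's `joinExpr`, optional `alias`, and `joinType` (defaulting to "left"). A self-contained sketch of that fold; the catalog/table names and join condition are hypothetical:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()

bronze_tables = [
    {"name": "cat.sch.base", "alias": "tab1"},
    {"name": "cat.sch.extra", "alias": "tab2",
     "joinExpr": "tab1.id = tab2.id", "joinType": "left"},
]

# The first definition seeds the DataFrame; the rest are joined onto it.
df = spark.table(bronze_tables[0]["name"]).alias(bronze_tables[0]["alias"])
for bronze_table in bronze_tables[1:]:
    join_df = spark.table(bronze_table["name"]).alias(bronze_table["alias"])
    df = df.join(
        join_df,
        expr(bronze_table["joinExpr"]),        # SQL condition as a Column
        bronze_table.get("joinType", "left"),  # same default as the diff
    )
```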
@@ -166,15 +240,17 @@
         )
 
         return (
-            (df, silver_output_map, gold_output_map)
-            if self.
-            else (None, silver_output_map, gold_output_map)
+            (df, silver_output_map, gold_output_map, pre_bronze_output)
+            if self._pre_silver
+            else (None, silver_output_map, gold_output_map, pre_bronze_output)
         )
 
     def _render_output(
         self,
         input_df: DataFrame,
-        stage_dataframes: Tuple[
+        stage_dataframes: Tuple[
+            List[DataFrame], DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]
+        ],
         gold_table_catalog: str,
         gold_table_schema: str,
     ) -> None:
@@ -195,7 +271,7 @@
         def d(txt, lvl) -> None:
             displayHTML(
                 f"""
-                <div style="background-color:
+                <div style="background-color:
                 background-color: rgb(18, 23, 26); padding: 0; margin: 0;">
                 <h{lvl} style="margin: 0; background-color: rgb(244, 234, 229);">{txt}</h{lvl}>
                 </div>
@@ -227,12 +303,16 @@
                     prefix=prefix + target_field.name + ".",
                 )
 
-        (
-            d("Input", 1)
+        (pre_silver, silver, gold, pre_bronze) = stage_dataframes
+        d("Autoloader Input", 1)
         display(input_df)
+        d("Bronze Pre-Transform", 1)
+        for name, df in pre_bronze.items():
+            d(f"{name}", 2)
+            display(df)
         d("Silver Pre-Transform", 1)
-        if
-        display(
+        if pre_silver:
+            display(pre_silver)
         else:
             d("Skipped", 2)
         d("Silver Transform", 1)
@@ -326,31 +406,32 @@
         )
 
         # If we are using the autoloader, fetch format from preset and others.
-        if self._ds_params._mode == "autoloader"
+        if self._ds_params._mode == "autoloader" or (
+            self._ds_params._mode == "silverbronze"
+            and self._ds_params._autoloader_location
+        ):
+            if self._preset.get("bronze", {}).get("loadAsSingleVariant", False) == True:
+                self._ds_params._set_load_as_single_variant()
             if not (autoloader_conf := self._preset.get("autoloader", None)):
                 raise MissingAutoloaderConfigError()
             if not (file_format := autoloader_conf.get("format", None)):
                 raise AutoloaderMissingFieldError("format")
-            self._ds_params.
+            self._ds_params._set_autoloader_format(file_format)
             if schemaFile := autoloader_conf.get("schemaFile", None):
-                self._ds_params.
-            if multiline := autoloader_conf.get("multiline", None):
-                if multiline == "true":
-                    self._ds_params.set_multiline(True)
-                else:
-                    self._ds_params.set_multiline(False)
+                self._ds_params._set_autoloader_schema_file(schemaFile)
             if cloudFiles := autoloader_conf.get("cloudFiles", None):
                 if schema_hints := cloudFiles.get("schemaHints", None):
-                    self._ds_params.
+                    self._ds_params._set_autoloader_cloudfiles_schema_hints(
+                        schema_hints
+                    )
                 if schema_hints_file := cloudFiles.get("schemaHintsFile", None):
-                    self._ds_params.
+                    self._ds_params._set_autoloader_cloudfiles_schema_hint_file(
                         schema_hints_file
                     )
 
         self._compile_stages()
 
         with self._ds_params as df:
-            self._bronze = df
             self._result_df_map = self._run(df)
             if display:
                 self._render_output(
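`evaluate` now pulls its autoloader settings from the preset itself. Judging only from the keys this hunk reads, a preset fragment exercising every branch would look roughly like the sketch below; all values are hypothetical (`self._preset` is the parsed YAML, so a plain dict stands in for it):

```python
# Hypothetical preset fragment covering the keys read above.
preset = {
    "bronze": {
        "loadAsSingleVariant": True,  # triggers _set_load_as_single_variant()
    },
    "autoloader": {
        "format": "json",                 # missing -> AutoloaderMissingFieldError
        "schemaFile": "/path/to/schema",  # optional explicit schema
        "cloudFiles": {
            "schemaHints": "id BIGINT",           # optional inline hints
            "schemaHintsFile": "/path/to/hints",  # optional hints file
        },
    },
}
```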
dasl_client/preset_development/preview_parameters.py

@@ -9,11 +9,6 @@ import uuid
 from IPython import get_ipython
 
 
-@udf(StringType())
-def constant_udf(*args):
-    return "<sortable_random_id>"
-
-
 class PreviewParameters:
     """
     This class provides three methods for supplying input records to the preset development environment.
@@ -60,6 +55,78 @@
         .set_table("system.access.audit")
     ```
 
+    **4. SilverBronze Mode:**
+    "silverbronze" mode works like a more advanced "table" mode. It allows for joining of multiple
+    tables as input. This mode requires setting bronze table definitions. This mode behaves in 2
+    separate ways depending on whether an autoloader location is set or not. If an autoloader location
+    is set, the first entry in the bronze table definitions is used to name and alias the autoloader's
+    input, and these can be used in later join expressions. Used in this way, the autoloader will be
+    loaded as in "autoloader" mode, and run through preBronze stages before being joined with the
+    remainder of the bronze table definitions. This mimics not skipping bronze in a DataSource and
+    joining what was read in silver. If an autoloader location is not set, the behaviour instead
+    attempts to emulate a DataSource set to skip the bronze stage. That is, all preBronze and bronze
+    stages will be skipped, and the name of the first entry in the given bronze table definitions will
+    be read from instead. Any subsequent bronze table definitions will be joined against this table.
+
+    Using no autoloader location (this will read from the first table):
+    ```python
+    bronze_tables = [
+        {
+            "name": "databricks_dev.default.sev_map",
+            "alias": "tab1"
+        },
+        {
+            "name": "databricks_dev.alan_bronze.akamai_waf",
+            "alias": "tab2",
+            "joinExpr": "id::string = tab2.serviceID",
+            "joinType": "left"
+        },
+        {
+            "name": "databricks_dev.alan_silver.cloudflare_hjttp_request",
+            "alias": "tab3",
+            "joinExpr": "tab1.id::string = tab3.ClientRequestsBytes",
+            "joinType": "inner"
+        }
+    ]
+
+    ds_params = (
+        PreviewParameters(spark)
+        .from_silverbronze_tables()
+        .set_bronze_table_definitions(bronze_tables)
+    )
+
+    ps = PreviewEngine(spark, yaml_string, ds_params)
+    ps.evaluate("stage.gold")
+    ```
+
+    Using an autoloader location (this will read from the autoloader and name the df tab1):
+    ```python
+    bronze_tables = [
+        {
+            "name": "tab1"
+        },
+        {
+            "name": "databricks_dev.alan_bronze.akamai_waf",
+            "alias": "tab2",
+            "joinExpr": "id::string = tab2.serviceID",
+            "joinType": "left"
+        },
+        {
+            "name": "databricks_dev.alan_silver.cloudflare_hjttp_request",
+            "alias": "tab3",
+            "joinExpr": "tab1.id::string = tab3.ClientRequestsBytes",
+            "joinType": "inner"
+        }
+    ]
+
+    ds_params = (
+        PreviewParameters(spark)
+        .from_silverbronze_tables()
+        .set_bronze_table_definitions(bronze_tables)
+        .set_autoloader_location("s3://antimatter-dasl-testing/csamples3/mars/area1/")
+    )
+    ```
+
     **Note:**
     When using autoloader mode, this implementation requires a location to store a temporary schema for
     the loaded records. By default, this is set to `"dbfs:/tmp/schemas"`. You can change this using
@@ -94,7 +161,7 @@
             df (DataFrame): Internal Spark DataFrame loaded using the specified parameters.
         """
         self._spark = spark
-        self._mode = None  # [input, autoloader]
+        self._mode = None  # [input, table, autoloader, silverbronze]
         self._record_limit = 10
         self._autoloader_temp_schema_location = "dbfs:/tmp/schemas"
         self._gold_test_schemas = []
@@ -110,18 +177,97 @@
         self._cloudfiles_schema_hints = None
         self._cloudfiles_reader_case_sensitive = "true"
         self._cloudfiles_multiline = "true"
+        self._cloudfiles_wholetext = "false"
         self._schema_uuid_str = str(uuid.uuid4())
+        self._single_variant_column = None
 
         self._schema = None
         self._data = None
 
         self._table = None
 
+        self._bronze_tables = None
+
         self._pretransform_name = None
-        self._bronze_pre_transform: Optional[List[str]] = None
 
         self._df = None
 
+    def __create_from_autoloader(self) -> DataFrame:
+        stream_df = (
+            self._spark.readStream.format("cloudFiles")
+            .option("cloudFiles.format", self._autoloader_format)
+            .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
+        )
+
+        # text and wholetext need to be handled separately.
+        stream_df = (
+            stream_df.option("multiline", self._cloudfiles_multiline)
+            if self._autoloader_format != "text"
+            else stream_df.option("wholetext", self._cloudfiles_wholetext)
+        )
+
+        if self._single_variant_column:
+            stream_df = stream_df.option(
+                "singleVariantColumn", self._single_variant_column
+            )
+
+        if self._schema_file:
+            with open(self._schema_file, "r") as f:
+                stream_df = stream_df.schema(f.read().strip())
+        else:
+            stream_df = (
+                stream_df.option("inferSchema", "true")
+                .option("cloudFiles.inferColumnTypes", "true")
+                .option(
+                    "cloudFiles.schemaLocation",
+                    f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+                )
+            )
+
+        if self._cloudfiles_schema_hints:
+            stream_df = stream_df.option(
+                "cloudFiles.schemaHints", self._cloudfiles_schema_hints
+            )
+        elif self._clouldfiles_schema_hints_file:
+            stream_df = stream_df.option(
+                "cloudFiles.schemaHintsFile", self._clouldfiles_schema_hints_file
+            )
+
+        stream_df = stream_df.load(self._autoloader_location).limit(self._record_limit)
+
+        query = (
+            stream_df.writeStream.format("memory")
+            .queryName("batch_data")
+            .trigger(availableNow=True)
+            .start()
+        )
+
+        query.awaitTermination()
+
+    def __create_from_silverbronze_tables_join(self) -> DataFrame:
+        if not self._bronze_tables or not len(self._bronze_tables):
+            raise MissingBronzeTablesError()
+
+        # Validate name and joinExpr are set.
+        for i in range(len(self._bronze_tables)):
+            if not self._bronze_tables[i].get("name", None):
+                raise MissingBronzeTableFieldError("name")
+            if i > 0 and not self._bronze_tables[i].get("joinExpr", None):
+                raise MissingBronzeTableFieldError("joinExpr")
+
+        # If there is an autoloader location given, we create the df now and
+        # then allow the preBronze stage to run. Otherwise we skip preBronze
+        # stages and, as part of running the silverbronze joins, we create the
+        # df from the first entry in the bronze tables list.
+        df = None
+        if self._autoloader_location:
+            self.__create_from_autoloader()
+            df = self._spark.table("batch_data").alias(
+                self._bronze_tables[0].get("name", "")
+            )  # Use first's name.
+
+        return df
+
     def __enter__(self):
         """
         Creates a DataFrame with data using the method specified. In the case of "autoloader",
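`__create_from_autoloader` materialises a bounded sample by pumping the cloudFiles stream into an in-memory sink: the `availableNow` trigger drains whatever files currently exist and stops, after which the sample is readable as the `batch_data` table (which is exactly what `__enter__` does next). The core of that pattern, cut down to a runnable sketch with placeholder format and paths:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

stream_df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")  # placeholder format
    .option("cloudFiles.schemaLocation", "dbfs:/tmp/schemas/example")
    .load("s3://bucket/prefix/")          # placeholder path
    .limit(10)
)

query = (
    stream_df.writeStream.format("memory")  # results land in an in-memory table
    .queryName("batch_data")
    .trigger(availableNow=True)             # process what exists now, then stop
    .start()
)
query.awaitTermination()

sample = spark.table("batch_data")  # the materialised, record-limited sample
```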
@@ -137,59 +283,10 @@
         elif self._mode == "table":
             self._df = self._spark.table(self._table).limit(self._record_limit)
         elif self._mode == "autoloader":
-
-                self._spark.readStream.format("cloudFiles")
-                .option("cloudFiles.format", self._autoloader_format)
-                .option("multiline", self._cloudfiles_multiline)
-                .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
-            )
-
-            if self._schema_file:
-                with open(self._schema_file, "r") as f:
-                    stream_df = stream_df.schema(f.read().strip())
-            else:
-                stream_df = (
-                    stream_df.option("inferSchema", "true")
-                    .option("cloudFiles.inferColumnTypes", "true")
-                    .option(
-                        "cloudFiles.schemaLocation",
-                        f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
-                    )
-                )
-
-            if self._cloudfiles_schema_hints:
-                stream_df = stream_df.option(
-                    "cloudFiles.schemaHints", self._cloudfiles_schema_hints
-                )
-            elif self._clouldfiles_schema_hints_file:
-                stream_df = stream_df.option(
-                    "cloudFiles.schemaHintsFile", self._clouldfiles_schema_hints_file
-                )
-
-            stream_df = stream_df.load(self._autoloader_location).limit(
-                self._record_limit
-            )
-
-            if self._bronze_pre_transform is not None:
-                stream_df = stream_df.selectExpr(*self._bronze_pre_transform)
-
-            query = (
-                stream_df.writeStream.format("memory")
-                .queryName("batch_data")
-                .trigger(availableNow=True)
-                .start()
-            )
-
-            query.awaitTermination()
-
+            self.__create_from_autoloader()
             self._df = self._spark.table("batch_data")
-
-
-            self._df = self._df.filter(
-                f"timestamp({self._time_column}) >= timestamp('{self._start_time}') AND timestamp({self._time_column}) < timestamp('{self._end_time}')"
-            )
-
-            self._df = self._df.withColumn("dasl_id", constant_udf())
+        elif self._mode == "silverbronze":
+            self._df = self.__create_from_silverbronze_tables_join()
 
         return self._df
 
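After this refactor `__enter__` is a pure mode dispatch, and `PreviewParameters` is still consumed as a context manager (PreviewEngine's `with self._ds_params as df:`). A usage sketch for the new mode, with the table list abbreviated from the class docstring's example:

```python
bronze_tables = [
    {"name": "cat.sch.base", "alias": "tab1"},
    {"name": "cat.sch.extra", "alias": "tab2",
     "joinExpr": "tab1.id = tab2.id", "joinType": "left"},
]

ds_params = (
    PreviewParameters(spark)
    .from_silverbronze_tables()
    .set_bronze_table_definitions(bronze_tables)
)

with ds_params as df:
    # __enter__ dispatches on _mode; "silverbronze" routes to
    # __create_from_silverbronze_tables_join(). Without an autoloader
    # location the joins happen later, in PreviewEngine._run, so df can
    # be None at this point.
    print(df)
```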
@@ -254,6 +351,36 @@
         self._mode = "table"
         return self
 
+    def from_silverbronze_tables(self):
+        """
+        Set the data source loader to "bronze tables" mode. Requires a list of bronze table
+        definitions to be provided.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._mode = "silverbronze"
+        return self
+
+    def set_bronze_table_definitions(self, definitions: List[Dict[str, str]]):
+        """
+        Set the bronze table definitions for bronze tables mode. `name` and `joinExpr` are
+        required. If `alias` is not provided, one can use the `name` to refer to the table.
+        If `joinType` is not provided, "left" is used as a default value. If pr
+
+        [
+            {
+                "name": "name",
+                "alias": "alias1",
+                "joinType": "inner",
+                "joinExpr": "base_table.col1 = alias1.col1
+            },
+            ...
+        ]
+        """
+        self._bronze_tables = definitions
+        return self
+
     def set_autoloader_temp_schema_location(self, path: str):
         """
         Set the location for the autoloader's streaming mode schema to be created. This is
@@ -311,7 +438,7 @@
         self._autoloader_location = location
         return self
 
-    def
+    def _set_autoloader_format(self, file_format: str):
         """
         Used internally to set the autoloader format.
 
@@ -320,12 +447,16 @@
         """
         if file_format.lower() == "jsonl":
             self._autoloader_format = "json"
-            self.
+            self._cloudfiles_multiline = "false"
+            return self
+        if file_format.lower() == "wholetext":
+            self._autoloader_format = "text"
+            self._cloudfiles_wholetext = "true"
             return self
         self._autoloader_format = file_format
         return self
 
-    def
+    def _set_autoloader_schema_file(self, path: str):
         """
         Set the schema file path for "autoloader" mode.
 
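`_set_autoloader_format` normalises two pseudo-formats before anything reaches cloudFiles: "jsonl" appears to mean JSON read line-by-line (multiline off), and "wholetext" the text reader with one record per file. A usage sketch; `params` stands for any configured PreviewParameters instance:

```python
# "jsonl" -> cloudFiles format "json" with multiline disabled.
params._set_autoloader_format("jsonl")

# "wholetext" -> cloudFiles format "text" with wholetext enabled.
params._set_autoloader_format("wholetext")

# Anything else passes through unchanged.
params._set_autoloader_format("csv")
```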
@@ -335,7 +466,7 @@
         self._schema_file = path
         return self
 
-    def
+    def _set_autoloader_cloudfiles_schema_hint_file(self, path: str):
         """
         Set the cloudFiles schema hints file path for "autoloader" mode.
 
@@ -345,7 +476,7 @@
         self._clouldfiles_schema_hints_file = path
         return self
 
-    def
+    def _set_autoloader_cloudfiles_schema_hints(self, cloudfiles_schema_hints: str):
         """
         Set the cloudFiles schema hints string for "autoloader" mode.
 
@@ -355,26 +486,6 @@
         self._cloudfiles_schema_hints = cloudfiles_schema_hints
         return self
 
-    def set_autoloader_reader_case_sensitive(self, b: bool):
-        """
-        Set the cloudFiles reader case-sensitive boolean for "autoloader" mode.
-
-        Returns:
-            PreviewParameters: The current instance with updated configuration.
-        """
-        self._cloudfiles_reader_case_sensitive = "true" if b else "false"
-        return self
-
-    def set_autoloader_multiline(self, b: bool):
-        """
-        Set the cloudFiles multiline boolean for "autoloader" mode.
-
-        Returns:
-            PreviewParameters: The current instance with updated configuration.
-        """
-        self._cloudfiles_multiline = "true" if b else "false"
-        return self
-
     def set_pretransform_name(self, pretransform_name: str):
         """
         Set the pretransform name to use, if desired. If not set, Silver PreTransform
@@ -386,16 +497,6 @@
         self._pretransform_name = pretransform_name
         return self
 
-    def set_bronze_pre_transform(self, expr: List[str]):
-        """
-        Sets a pre-transform expression that will run before data is written to bronze
-
-        Returns:
-            PreviewParameters: The current instance with updated configuration.
-        """
-        self._bronze_pre_transform = expr
-        return self
-
     def set_date_range(self, column: str, start_time: str, end_time: str):
         """
         Set the TIMESTAMP column and date range to use as the input data filter to
@@ -431,6 +532,17 @@
         self._table = table_name
         return self
 
+    def _set_load_as_single_variant(self, col_name: Optional[str] = None):
+        """
+        Enable loadAsSingleVariant mode. This will ingest data into a single VARIANT-typed column.
+        The default name of that column is `data`.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._single_variant_column = col_name if col_name is not None else "data"
+        return self
+
     def add_gold_schema_table(self, gold_schema_table_name: str):
         """
         Add a gold schema temporary table name that will need to be cleaned