dasl-client 1.0.14__tar.gz → 1.0.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dasl-client might be problematic.
- {dasl_client-1.0.14 → dasl_client-1.0.16}/PKG-INFO +2 -3
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/client.py +22 -6
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/errors.py +4 -1
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/preview_engine.py +23 -7
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/preview_parameters.py +31 -6
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/stage.py +87 -20
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/dbui.py +12 -7
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client.egg-info/PKG-INFO +2 -3
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client.egg-info/SOURCES.txt +4 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client.egg-info/requires.txt +1 -2
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client.egg-info/top_level.txt +1 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/pyproject.toml +2 -3
- dasl_client-1.0.16/test/__init__.py +0 -0
- dasl_client-1.0.16/test/conftest.py +18 -0
- dasl_client-1.0.16/test/constants.py +10 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/test/test_api_surface.py +1 -24
- dasl_client-1.0.16/test/test_databricks_secret_auth.py +116 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/test/test_marshaling.py +5 -6
- {dasl_client-1.0.14 → dasl_client-1.0.16}/LICENSE +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/README.md +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/__init__.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/auth/__init__.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/auth/auth.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/conn/__init__.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/conn/client_identifier.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/conn/conn.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/errors/__init__.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/errors/errors.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/helpers.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/__init__.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/__init__.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/admin_config.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/content.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/datasource.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/helpers.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/rule.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/types.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/workspace_config.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client.egg-info/dependency_links.txt +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/setup.cfg +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/setup.py +0 -0
- {dasl_client-1.0.14 → dasl_client-1.0.16}/test/test_api_changes.py +0 -0
{dasl_client-1.0.14 → dasl_client-1.0.16}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dasl_client
-Version: 1.0.14
+Version: 1.0.16
 Summary: The DASL client library used for interacting with the DASL workspace
 Home-page: https://github.com/antimatter/asl
 Author: Antimatter Team
@@ -11,8 +11,7 @@ License-File: LICENSE
 Requires-Dist: dasl_api==0.1.17
 Requires-Dist: databricks-sdk>=0.41.0
 Requires-Dist: pydantic>=2
-Requires-Dist:
-Requires-Dist: typing_extensions==4.10.0
+Requires-Dist: typing_extensions>=4.10.0
 
 # DASL Client Library
 
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/client.py
RENAMED
@@ -1,4 +1,6 @@
 from copy import deepcopy
+from datetime import datetime, timedelta
+from time import sleep
 from typing import Any, Callable, Iterator, List, Optional, Tuple, TypeVar
 from pydantic import Field
 from pyspark.sql import DataFrame
@@ -608,6 +610,7 @@ class Client:
         self,
         warehouse: str,
         request: TransformRequest,
+        timeout: timedelta = timedelta(minutes=5),
     ) -> TransformResponse:
         """
         Run a sequence of ADHOC transforms against a SQL warehouse to
@@ -618,16 +621,29 @@
         :return: a TransformResponse object containing the results
             after running the transforms.
         :raises: NotFoundError if the rule does not exist
+        :raises: Exception for a server-side error or timeout
         """
         with error_handler():
-
-            self.
-
-
-                request.to_api_obj(),
-            )
+            status = self._dbui_client().dbui_v1_transform(
+                self._workspace(),
+                warehouse,
+                request.to_api_obj(),
             )
 
+            begin = datetime.now()
+            while datetime.now() - begin < timeout:
+                sleep(5)
+                status = self._dbui_client().dbui_v1_transform_status(
+                    self._workspace(), status.id
+                )
+
+                if status.status == "failure":
+                    raise Exception(f"adhoc transform failed with {status.error}")
+                elif status.status == "success":
+                    return TransformResponse.from_api_obj(status.result)
+
+            raise Exception("timed out waiting for adhoc transform result")
+
     def get_observable_events(
         self,
         warehouse: str,
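
Note on the hunk above: the transform call now submits the request via dbui_v1_transform and then polls dbui_v1_transform_status every 5 seconds until success, failure, or the new timeout elapses. A minimal usage sketch; the public method name is not visible in this hunk, so `transform` and the `client`/`request` objects below are assumptions:

    from datetime import timedelta

    # Allow up to 10 minutes instead of the 5-minute default; the call raises
    # on a reported failure and on timeout, otherwise returns a TransformResponse.
    response = client.transform(
        warehouse="my-sql-warehouse",      # hypothetical warehouse identifier
        request=request,                   # a prepared TransformRequest
        timeout=timedelta(minutes=10),
    )
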
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/errors.py
RENAMED
@@ -149,7 +149,10 @@ class AssertionFailedError(PresetError):
     def __init__(self, expr: str, assertion_message: str, df: DataFrame):
         # Get the Databricks built-in functions out the namespace.
         ipython = get_ipython()
-
+        if ipython is not None:
+            display = ipython.user_ns["display"]
+        else:
+            display = lambda x: x.show()
 
         self.expr = expr
         self.assertion_message = assertion_message
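
Note: the change above lets AssertionFailedError render its DataFrame outside a Databricks notebook by falling back to DataFrame.show() when no IPython session provides display. The same pattern in isolation, as a small sketch (not part of the package):

    from IPython import get_ipython

    def resolve_display():
        # Use the notebook's display() from the IPython user namespace when available,
        # otherwise fall back to plain DataFrame.show().
        ipython = get_ipython()
        if ipython is not None and "display" in ipython.user_ns:
            return ipython.user_ns["display"]
        return lambda df: df.show()
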
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/preview_engine.py
RENAMED
@@ -54,10 +54,13 @@ class PreviewEngine:
             self._preset.get("silver", None), self._pretransform_name
         )
 
+        self._bronze = None
         self._pre = None
         self._silver = []
         self._gold = []
-        self._result_df_map
+        self._result_df_map: Tuple[
+            DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]
+        ] = (None, {}, {})
 
     def _validate_pretransform_name(
         self, silver: Dict[str, str], pretransform_name: str
@@ -181,8 +184,12 @@ class PreviewEngine:
 
         # Get the Databricks built-in functions out the namespace.
         ipython = get_ipython()
-
-
+        if ipython is not None:
+            displayHTML = ipython.user_ns["displayHTML"]
+            display = ipython.user_ns["display"]
+        else:
+            displayHTML = lambda x: print(x)
+            display = lambda x: x.show()
 
         def d(txt, lvl) -> None:
             displayHTML(
@@ -245,7 +252,7 @@ class PreviewEngine:
             raise UnknownGoldTableError(name, gold_table_schema)
 
         # Performs the type check.
-        delta_df = self._spark.table(f"{gold_table_schema}
+        delta_df = self._spark.table(f"`{gold_table_schema}`.`{name}`").limit(0)
         unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
 
         # Now we check no new columns.
@@ -286,7 +293,7 @@ class PreviewEngine:
         d("Resultant gold table preview", 3)
         display(unioned_df)
 
-    def evaluate(self, gold_table_schema: str) -> None:
+    def evaluate(self, gold_table_schema: str, display: bool = True) -> None:
         """
         Evaluates the loaded preset YAML using the input datasource configuration to load
         records. Finally, checks that the output from the Gold stages is compatible with
@@ -303,7 +310,9 @@ class PreviewEngine:
         ):
             if not any(
                 row.databaseName == schema_name
-                for row in self._spark.sql(
+                for row in self._spark.sql(
+                    f"SHOW SCHEMAS IN `{catalog_name}`"
+                ).collect()
             ):
                 raise InvalidGoldTableSchemaError(
                     gold_table_schema,
@@ -340,5 +349,12 @@ class PreviewEngine:
         self._compile_stages()
 
         with self._ds_params as df:
+            self._bronze = df
             self._result_df_map = self._run(df)
-
+            if display:
+                self._render_output(df, self._result_df_map, gold_table_schema)
+
+    def results(
+        self,
+    ) -> Tuple[DataFrame, DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]]:
+        return self._bronze, *self._result_df_map
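
Note: evaluate() gains a display flag and the new results() method exposes the intermediate DataFrames. A usage sketch, assuming an already-configured PreviewEngine instance named `engine` and a hypothetical gold schema name:

    # Run the preset without rendering HTML previews, then inspect the outputs.
    engine.evaluate("my_catalog.my_gold_schema", display=False)

    # results() returns the bronze input plus the contents of _result_df_map,
    # typed as (DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]); the maps
    # appear to hold the per-stage silver and gold outputs.
    bronze_df, pre_df, silver_dfs, gold_dfs = engine.results()
    bronze_df.show(5)
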
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/preview_parameters.py
RENAMED
@@ -1,3 +1,5 @@
+from typing import Optional
+
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import *
 from pyspark.sql.dataframe import DataFrame
@@ -115,6 +117,7 @@ class PreviewParameters:
         self._table = None
 
         self._pretransform_name = None
+        self._bronze_pre_transform: Optional[List[str]] = None
 
         self._df = None
 
@@ -166,10 +169,13 @@ class PreviewParameters:
             self._record_limit
         )
 
+        if self._bronze_pre_transform is not None:
+            stream_df = stream_df.selectExpr(*self._bronze_pre_transform)
+
         query = (
             stream_df.writeStream.format("memory")
             .queryName("batch_data")
-            .trigger(
+            .trigger(availableNow=True)
             .start()
         )
 
@@ -193,12 +199,17 @@ class PreviewParameters:
 
         # Get the Databricks built-in functions out the namespace.
         ipython = get_ipython()
-
+        if ipython is not None:
+            dbutils = ipython.user_ns["dbutils"]
 
-
-
-
-
+            dbutils.fs.rm(
+                f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+                recurse=True,
+            )
+        else:
+            print(
+                f"FYI, we are leaking temp data {self._autoloader_temp_schema_location}/{self._schema_uuid_str}"
+            )
 
     def from_input(self):
         """
@@ -286,6 +297,10 @@ class PreviewParameters:
         Returns:
             PreviewParameters: The current instance with updated configuration.
         """
+        if file_format.lower() == "jsonl":
+            self._autoloader_format = "json"
+            self.set_autoloader_multiline(False)
+            return self
         self._autoloader_format = file_format
         return self
 
@@ -350,6 +365,16 @@ class PreviewParameters:
         self._pretransform_name = pretransform_name
         return self
 
+    def set_bronze_pre_transform(self, expr: List[str]):
+        """
+        Sets a pre-transform expression that will run before data is written to bronze
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._bronze_pre_transform = expr
+        return self
+
     def set_date_range(self, column: str, start_time: str, end_time: str):
         """
         Set the TIMESTAMP column and date range to use as the input data filter to
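
Note: set_autoloader_format() now maps "jsonl" to the JSON reader with multiline disabled, and set_bronze_pre_transform() applies a list of SQL select expressions to the stream before it is staged. A usage sketch, assuming a PreviewParameters instance named `params`; the column expressions are illustrative only:

    params = (
        params.set_autoloader_format("jsonl")   # stored as "json", multiline=False
        .set_bronze_pre_transform([
            "*",                                      # keep existing columns
            "to_timestamp(event_time) AS event_ts",   # hypothetical derived column
        ])
    )
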
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/preset_development/stage.py
RENAMED
@@ -150,17 +150,19 @@ class Stage:
         if target_col not in existing_columns:
             raise ReferencedColumnMissingError("jsonExtract", target_col)
         schema = self._spark.sql(
-            f"SELECT schema_of_json_agg({target_col}) AS sc FROM {{df}}",
+            f"SELECT schema_of_json_agg({self.auto_backtick(target_col)}) AS sc FROM {{df}}",
+            df=df,
         ).collect()[0][0]
         extract_df = self._spark.createDataFrame(data=[], schema=schema)
         columns = extract_df.columns
         columns = [
-            f"extract.{col} AS {col}"
+            self.auto_backtick(f"extract.{col}") + f" AS {self.auto_backtick(col)}"
             for col in columns
             if col not in omit_fields and col not in existing_columns
         ]
         columns += [
-            f"extract.{col}
+            self.auto_backtick(f"extract.{col}")
+            + f" AS {self.auto_backtick(duplicate_prefix + col)}"
             for col in columns
             if col not in omit_fields and col in existing_columns
         ]
@@ -176,7 +178,10 @@ class Stage:
             A DataFrame with the resultant operation's records.
         """
         return (
-            df.selectExpr(
+            df.selectExpr(
+                "*",
+                f"from_json({self.auto_backtick(target_col)}, '{schema}') AS extract",
+            )
             .selectExpr("*", *columns)
             .drop("extract")
         )
@@ -198,7 +203,10 @@ class Stage:
         """
         extract_df = self._spark.createDataFrame(data=[], schema=schema)
         schema = extract_df.drop(omit_fields).schema.simpleString()
-        return df.selectExpr(
+        return df.selectExpr(
+            "*",
+            f"from_json({self.auto_backtick(target_col)}, '{schema}') AS {self.auto_backtick(name)}",
+        )
 
     def preserved_columns(
         self, df: DataFrame
@@ -224,7 +232,7 @@ class Stage:
         duplicate_prefix = self._duplicate_prefix()
         column_names = self._column_names()
         duplicate_renames = [
-            f"{col} AS {duplicate_prefix
+            f"{self.auto_backtick(col)} AS {self.auto_backtick(duplicate_prefix + col)}"
             for col in preserved_columns
             if col in column_names
         ]
@@ -296,14 +304,46 @@ class Stage:
         """
         if field.get("from", None):
             # check that the from column exists in the df?
-            return f"{field['from']} AS {name}"
+            return f"{self.auto_backtick(field['from'])} AS {self.auto_backtick(name)}"
         elif field.get("literal", None):
-            return f"'{field['literal']}' AS {name}"
-        elif field.get("expr", None):
-            return f"{field['expr']} AS {name}"
+            return f"'{field['literal']}' AS {self.auto_backtick(name)}"
+        elif field.get("expr", None) is not None:
+            return f"{field['expr']} AS {self.auto_backtick(name)}"
         else:
             return ""
 
+    def is_backtick_escaped(self, name: str) -> bool:
+        """
+        check if a given (column) name is backtick escaped or not
+        :param name: column name
+        :return: bool
+        """
+        return name.startswith("`") and name.endswith("`")
+
+    def auto_backtick(self, name: str) -> str:
+        """
+        auto-backtick given name in case it isn't already backtick escaped.
+        if the name contains dots it will get split and each component backticked individually.
+        Returns the name wrapped in backticks or the passed name if it already had backticks.
+        :param name: column name
+        :return: str
+        """
+        if self.is_backtick_escaped(name):
+            return name
+        parts = name.split(".")
+        return ".".join(list(map(lambda s: f"`{s}`", parts)))
+
+    def force_apply_backticks(self, name: str) -> str:
+        """
+        forces application of backticks to the given (column) name as a single unit
+        if it already has backticks this is a noop
+        :param name: column name
+        :return: str
+        """
+        if self.is_backtick_escaped(name):
+            return name
+        return f"`{name}`"
+
     def process_node(self, name: str, node: Node) -> str:
         """
         Processes a single node in a tree of nodes.
@@ -319,7 +359,7 @@ class Stage:
             child_expr = self.process_node(child_name, child_node)
             fields_list.append(f"{child_expr}")
             joined_fields = ",\n".join(fields_list)
-            return f"struct(\n{joined_fields}\n) AS {name}"
+            return f"struct(\n{joined_fields}\n) AS {self.auto_backtick(name)}"
         else:
             return ""
 
@@ -341,12 +381,22 @@ class Stage:
         """
         Renders a list of field specifications containing both simple and
         STRUCT references into valid, STRUCT cognicient, SELECT expressions.
+        if a nested field is wrapped in backticks it will be treated as a simple field
+        for example field of name `col.with.dots` will NOT be treated as nested field.
 
         Returns:
             The SQL expression.
         """
-        simple_fields = [
-
+        simple_fields = [
+            f
+            for f in fields
+            if "." not in f["name"] or self.is_backtick_escaped(f["name"])
+        ]
+        nested_fields = [
+            f
+            for f in fields
+            if "." in f["name"] and not self.is_backtick_escaped(f["name"])
+        ]
 
         result_parts = []
         for field in simple_fields:
@@ -358,7 +408,7 @@ class Stage:
             nested_str = self.parse_to_string(tree)
             result_parts.append(nested_str)
 
-        return [p for p in result_parts if p]
+        return [p for p in result_parts if p is not None and len(p) > 0]
 
     def select_expr(self, df: DataFrame) -> str:
         """
@@ -379,8 +429,12 @@ class Stage:
         if should_preserve:
             if embed_col := preserve.get("embedColumn", None):
                 preserved_columns = self.preserved_columns_embed_column(df)
+                # preserved_columns is obtained from df.columns
+                # applying backticks to all of them is OK here
+                # since they will never use "obj.key" to reference nested fields of structs
+                # so we just go ahead and apply backticks to all across the board
                 select_fields += [
-                    f"struct({', '.join(preserved_columns)}) AS {embed_col}"
+                    f"struct({', '.join(list(map(lambda x: self.force_apply_backticks(x), preserved_columns)))}) AS {self.auto_backtick(embed_col)}"
                 ]
             else:
                 (
@@ -388,8 +442,13 @@ class Stage:
                     duplicate_renames,
                     column_names,
                 ) = self.preserved_columns(df)
-
-                select_fields +=
+                # see note above: same here - apply backticks to all columns across the board
+                select_fields += list(
+                    map(lambda x: self.force_apply_backticks(x), preserved_columns)
+                )
+                select_fields += list(
+                    map(lambda x: self.force_apply_backticks(x), duplicate_renames)
+                )
 
         return ["*"] + select_fields if self._stage == "temp_fields" else select_fields
 
@@ -475,7 +534,9 @@ class Stage:
             df = (
                 df.alias("tmp")
                 .join(df_joined, on=[df[lhs] == df_joined[rhs]], how="left")
-                .selectExpr(
+                .selectExpr(
+                    "tmp.*", f"{select} AS {self.auto_backtick(field.get('name'))}"
+                )
             )
         elif csv := join.get("withCSV", None):
             if path := csv.get("path", None):
@@ -485,7 +546,10 @@ class Stage:
                 df = (
                     df.alias("tmp")
                     .join(df_joined, on=[df[lhs] == df_joined[rhs]], how="left")
-                    .selectExpr(
+                    .selectExpr(
+                        "tmp.*",
+                        f"{select} AS {self.auto_backtick(field.get('name'))}",
+                    )
                 )
         else:
             raise MissingJoinFieldError("withTable or withCSV (please supply 1)")
@@ -500,7 +564,10 @@ class Stage:
         """
         for field in self._fields:
             if field.get("alias", None):
-                df = df.selectExpr(
+                df = df.selectExpr(
+                    "*",
+                    f"{self.auto_backtick(field.get('alias'))} AS {self.auto_backtick(field.get('name'))}",
+                )
         return df
 
     def run_assertions(self, df: DataFrame) -> DataFrame:
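
Note: the backtick helpers above drive most of this file's changes. auto_backtick() quotes each dot-separated component so the name still resolves as a nested reference, while force_apply_backticks() quotes the whole name as a single identifier; already-escaped names pass through unchanged. Expected behaviour, assuming `stage` is an existing Stage instance:

    stage.auto_backtick("src.ip")                  # '`src`.`ip`'  (nested struct reference)
    stage.auto_backtick("`col.with.dots`")         # '`col.with.dots`'  (already escaped, unchanged)
    stage.force_apply_backticks("col.with.dots")   # '`col.with.dots`'  (quoted as one column name)
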
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client/types/dbui.py
RENAMED
@@ -11,7 +11,8 @@ from dasl_api import (
     DbuiV1TransformRequestTransformsInner,
     DbuiV1TransformRequestTransformsInnerPresetOverrides,
     DbuiV1TransformResponse,
-    DbuiV1TransformResponseStagesInner,
+    DbuiV1TransformResponseStagesInner,
+    ContentV1DatasourcePresetAutoloaderCloudFiles,
     DbuiV1TransformRequestAutoloaderInput,
 )
 
@@ -202,7 +203,7 @@ class TransformRequest(BaseModel):
 
         @staticmethod
         def from_api_obj(
-
+            obj: Optional[ContentV1DatasourcePresetAutoloaderCloudFiles],
         ) -> "TransformRequest.Autoloader.CloudFiles":
             if obj is None:
                 return None
@@ -227,7 +228,7 @@ class TransformRequest(BaseModel):
 
         @staticmethod
         def from_api_obj(
-
+            obj: Optional[DbuiV1TransformRequestAutoloaderInput],
         ) -> "Optional[TransformRequest.Autoloader]":
             if obj is None:
                 return None
@@ -235,10 +236,13 @@ class TransformRequest(BaseModel):
                 format=obj.format,
                 location=obj.location,
                 schema_file=obj.schema_file,
-                cloud_files=TransformRequest.Autoloader.CloudFiles.from_api_obj(
+                cloud_files=TransformRequest.Autoloader.CloudFiles.from_api_obj(
+                    obj.cloud_files
+                ),
                 row_count=obj.row_count,
                 row_offset=obj.row_offset,
             )
+
         def to_api_obj(self) -> DbuiV1TransformRequestAutoloaderInput:
             return DbuiV1TransformRequestAutoloaderInput(
                 format=self.format,
@@ -247,10 +251,9 @@ class TransformRequest(BaseModel):
                 schema=self.schema_file,
                 cloudFiles=Helpers.maybe(lambda o: o.to_api_obj(), self.cloud_files),
                 rowCount=self.row_count,
-                rowOffset=self.row_offset
+                rowOffset=self.row_offset,
             )
 
-
     class Transform(BaseModel):
         """
         A transform configuration to apply to the data.
@@ -360,7 +363,9 @@ class TransformRequest(BaseModel):
     def from_api_obj(obj: DbuiV1TransformRequest) -> "TransformRequest":
         return TransformRequest(
            input=TransformRequest.Input.from_api_obj(obj.input),
-            autoloader_input=TransformRequest.Autoloader.from_api_obj(
+            autoloader_input=TransformRequest.Autoloader.from_api_obj(
+                obj.autoloader_input
+            ),
            use_preset=obj.use_preset,
            transforms=[
                TransformRequest.Transform.from_api_obj(item) for item in obj.transforms
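
Note: Autoloader.from_api_obj() now takes typed Optional inputs and forwards obj.cloud_files explicitly, and row_count/row_offset round-trip through to_api_obj(). A construction sketch mirroring the updated test_marshaling case further below (field values are illustrative):

    autoloader = TransformRequest.Autoloader(
        format="csv",
        location="s3://bucket/data",
        schema_file="schema.json",
        cloud_files=TransformRequest.Autoloader.CloudFiles(
            schema_hints_file="hints_file.csv",
            schema_hints="hint1, hint2",
        ),
        row_count=1,
        row_offset=5,
    )
    api_obj = autoloader.to_api_obj()   # DbuiV1TransformRequestAutoloaderInput
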
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dasl_client
-Version: 1.0.14
+Version: 1.0.16
 Summary: The DASL client library used for interacting with the DASL workspace
 Home-page: https://github.com/antimatter/asl
 Author: Antimatter Team
@@ -11,8 +11,7 @@ License-File: LICENSE
 Requires-Dist: dasl_api==0.1.17
 Requires-Dist: databricks-sdk>=0.41.0
 Requires-Dist: pydantic>=2
-Requires-Dist:
-Requires-Dist: typing_extensions==4.10.0
+Requires-Dist: typing_extensions>=4.10.0
 
 # DASL Client Library
 
{dasl_client-1.0.14 → dasl_client-1.0.16}/dasl_client.egg-info/SOURCES.txt
RENAMED
@@ -31,6 +31,10 @@ dasl_client/types/helpers.py
 dasl_client/types/rule.py
 dasl_client/types/types.py
 dasl_client/types/workspace_config.py
+test/__init__.py
+test/conftest.py
+test/constants.py
 test/test_api_changes.py
 test/test_api_surface.py
+test/test_databricks_secret_auth.py
 test/test_marshaling.py
{dasl_client-1.0.14 → dasl_client-1.0.16}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dasl_client"
-version = "1.0.14"
+version = "1.0.16"
 description = "The DASL client library used for interacting with the DASL workspace"
 readme = "README.md"
 authors = [
@@ -16,6 +16,5 @@ dependencies = [
     "dasl_api==0.1.17",
     "databricks-sdk>=0.41.0",
     "pydantic>=2",
-    "
-    "typing_extensions==4.10.0",
+    "typing_extensions>=4.10.0",
 ]
dasl_client-1.0.16/test/__init__.py
File without changes
dasl_client-1.0.16/test/conftest.py
@@ -0,0 +1,18 @@
+import pytest
+
+from dasl_client import Client
+
+from .constants import *
+
+
+@pytest.fixture(scope="session")
+def api_client():
+    client = Client.new_workspace(
+        admin_email="test@antimatter.io",
+        app_client_id=app_client_id,
+        service_principal_id=databricks_client_id,
+        service_principal_secret=databricks_client_secret,
+        workspace_url=databricks_host,
+        dasl_host=dasl_host,
+    )
+    yield client
dasl_client-1.0.16/test/constants.py
@@ -0,0 +1,10 @@
+import os
+from urllib.parse import urlparse
+
+dasl_host = os.environ["DASL_API_URL"]
+databricks_host = os.environ["DASL_DATABRICKS_HOST"]
+databricks_client_id = os.environ["DASL_DATABRICKS_CLIENT_ID"]
+databricks_client_secret = os.environ["DASL_DATABRICKS_CLIENT_SECRET"]
+workspace = urlparse(databricks_host).hostname
+app_client_id = "22853b93-68ba-4ae2-8e41-976417f501dd"
+alternate_app_client_id = "335ac0d3-e0ea-4732-ba93-0277423b5029"
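
Note: the test configuration moved into test/constants.py, and the Databricks variables gained a DASL_ prefix (DATABRICKS_HOST becomes DASL_DATABRICKS_HOST, and so on; see the test_api_surface.py hunk below). The secret-auth test additionally reads PYLIB_VOLUME_PATH and PYLIB_WHEEL_PATH. A sketch of the environment the test session now expects; the values are placeholders only:

    import os

    os.environ.setdefault("DASL_API_URL", "https://dasl.example.com")
    os.environ.setdefault("DASL_DATABRICKS_HOST", "https://example.cloud.databricks.com")
    os.environ.setdefault("DASL_DATABRICKS_CLIENT_ID", "<service-principal-id>")
    os.environ.setdefault("DASL_DATABRICKS_CLIENT_SECRET", "<service-principal-secret>")
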
{dasl_client-1.0.14 → dasl_client-1.0.16}/test/test_api_surface.py
RENAMED
@@ -1,29 +1,6 @@
-import os
-import pytest
-from urllib.parse import urlparse
-
 from dasl_client import *
 
-
-databricks_host = os.environ["DATABRICKS_HOST"]
-databricks_client_id = os.environ["DATABRICKS_CLIENT_ID"]
-databricks_client_secret = os.environ["DATABRICKS_CLIENT_SECRET"]
-workspace = urlparse(databricks_host).hostname
-app_client_id = "22853b93-68ba-4ae2-8e41-976417f501dd"
-alternate_app_client_id = "335ac0d3-e0ea-4732-ba93-0277423b5029"
-
-
-@pytest.fixture(scope="session")
-def api_client():
-    client = Client.new_workspace(
-        admin_email="test@antimatter.io",
-        app_client_id=app_client_id,
-        service_principal_id=databricks_client_id,
-        service_principal_secret=databricks_client_secret,
-        workspace_url=databricks_host,
-        dasl_host=dasl_host,
-    )
-    yield client
+from .constants import *
 
 
 def test_admin_config(api_client):
dasl_client-1.0.16/test/test_databricks_secret_auth.py
@@ -0,0 +1,116 @@
+import base64
+import datetime
+import os
+import time
+
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service import jobs, workspace as dbworkspace
+
+from .constants import *
+
+pylib_volume_path = os.environ["PYLIB_VOLUME_PATH"]
+pylib_wheel_path = os.environ["PYLIB_WHEEL_PATH"]
+
+
+def test_secret_auth(api_client):
+    # making sure it's even possible to get a config
+    api_client.get_config()
+
+    # need to do an API operation using databricks secret auth.
+    notebook_data = f"""
+%pip install {pylib_wheel_path}
+dbutils.library.restartPython()
+# COMMAND ----------
+from dasl_client.client import Client
+
+Client.for_workspace(
+    workspace_url="{databricks_host}",
+    dasl_host="{dasl_host}",
+).get_config()
+# COMMAND ----------
+dbutils.notebook.exit("SUCCESS")
+"""
+    print(f"notebook_data={notebook_data}")
+
+    wsc = WorkspaceClient()
+    wsc.workspace.mkdirs(path=pylib_volume_path)
+
+    notebook_path = f"{pylib_volume_path}/test_secret_auth_notebook"
+    wsc.workspace.import_(
+        path=notebook_path,
+        format=dbworkspace.ImportFormat.SOURCE,
+        language=dbworkspace.Language.PYTHON,
+        content=base64.b64encode(notebook_data.encode("utf-8")).decode("utf-8"),
+        overwrite=True,
+    )
+
+    job_id = None
+    try:
+        job_id = wsc.jobs.create(
+            name="run test_secret_auth notebook",
+            tasks=[
+                jobs.Task(
+                    task_key="run_notebook",
+                    notebook_task=jobs.NotebookTask(notebook_path=notebook_path),
+                ),
+            ],
+        ).job_id
+
+        wsc.jobs.run_now(job_id=job_id)
+
+        logs = []
+        start = datetime.datetime.now()
+        complete = False
+        while not complete:
+            elapsed = datetime.datetime.now() - start
+            if elapsed > datetime.timedelta(seconds=300):
+                raise Exception(f"timed out waiting for job")
+
+            time.sleep(5)
+
+            status, logs = fetch_latest_run_status_and_logs(wsc, job_id)
+            print(f"logs={logs}")
+
+            if status == jobs.TerminationCodeCode.RUN_EXECUTION_ERROR:
+                raise Exception(f"job terminated with error")
+
+            complete = status == jobs.TerminationCodeCode.SUCCESS
+
+        print(logs)
+        assert len(logs) == 1
+        assert logs[0] == "SUCCESS"
+    finally:
+        wsc.workspace.delete(pylib_volume_path, recursive=True)
+        if job_id is not None:
+            wsc.jobs.delete(job_id=job_id)
+
+
+def fetch_latest_run_status_and_logs(
+    wsc: WorkspaceClient,
+    job_id: str,
+):
+    runs = list(wsc.jobs.list_runs(job_id=job_id, expand_tasks=True))
+    if not runs:
+        return "No runs found", None
+
+    # Find the latest run based on the start time
+    latest_run = max(runs, key=lambda r: r.start_time)
+    if latest_run.status.termination_details is None:
+        return "No runs found", None
+    status = latest_run.status.termination_details.code
+    logs = []
+    for task in latest_run.tasks:
+        output = wsc.jobs.get_run_output(task.run_id)
+        if output.error is not None:
+            logs.append(output.error)
+        elif output.logs is not None:
+            logs.append(output.logs)
+        elif output.notebook_output is not None:
+            logs.append(output.notebook_output.result)
+        elif output.run_job_output is not None:
+            raise Exception("Nested jobs are not supported")
+        elif output.sql_output is not None:
+            raise Exception("SQL jobs are unsupported")
+        else:
+            logs.append("")
+    return status, logs
{dasl_client-1.0.14 → dasl_client-1.0.16}/test/test_marshaling.py
RENAMED
@@ -1,9 +1,6 @@
-import os
-
 from dasl_client import *
 
-
-workspace = databricks_host.split("//")[1]
+from .constants import *
 
 
 def test_workspace_config_marshal_unmarshal():
@@ -701,13 +698,15 @@ def test_transform_request_marshal_unmarshal():
             ],
             data=[{"col1": "1", "col2": "a"}, {"col1": "2", "col2": "b"}],
         ),
-        autoloader_input=
+        autoloader_input=TransformRequest.Autoloader(
             format="csv",
             location="s3://bucket/data",
             schema_file="schema.json",
-            cloud_files=
+            cloud_files=TransformRequest.Autoloader.CloudFiles(
                 schema_hints_file="hints_file.csv", schema_hints="hint1, hint2"
             ),
+            row_count=1,
+            row_offset=5,
         ),
         use_preset="preset_value",
         transforms=[