dasl-client 1.0.7__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dasl-client might be problematic.
- dasl_client/auth/auth.py +6 -5
- dasl_client/client.py +45 -9
- dasl_client/preset_development/__init__.py +4 -0
- dasl_client/preset_development/errors.py +159 -0
- dasl_client/preset_development/preview_engine.py +344 -0
- dasl_client/preset_development/preview_parameters.py +386 -0
- dasl_client/preset_development/stage.py +559 -0
- dasl_client/types/__init__.py +1 -0
- dasl_client/types/admin_config.py +10 -7
- dasl_client/types/content.py +235 -0
- dasl_client/types/datasource.py +177 -138
- dasl_client/types/dbui.py +46 -34
- dasl_client/types/rule.py +91 -65
- dasl_client/types/types.py +67 -54
- dasl_client/types/workspace_config.py +90 -74
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/METADATA +3 -2
- dasl_client-1.0.11.dist-info/RECORD +29 -0
- dasl_client-1.0.7.dist-info/RECORD +0 -23
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/LICENSE +0 -0
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/WHEEL +0 -0
- {dasl_client-1.0.7.dist-info → dasl_client-1.0.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,386 @@
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import *
+from pyspark.sql.functions import col, lit, udf
+from dasl_client.preset_development.errors import *
+from typing import List, Tuple
+import uuid
+from IPython import get_ipython
+
+
+# Placeholder UDF: stamps each preview row with a stand-in dasl_id value.
+@udf(StringType())
+def constant_udf(*args):
+    return "<sortable_random_id>"
+
+
+class PreviewParameters:
+    """
+    This class provides three modes for supplying input records to the preset development environment.
+
+    **1. Input Mode:**
+    In "input" mode, the user provides the schema and data directly using `StructType`, `StructField`,
+    and a list of tuples representing the data. For example:
+
+    ```python
+    schema = StructType([
+        StructField('name', StringType(), True),
+        StructField('age', IntegerType(), True)
+    ])
+
+    data = [("Mikhail", 15), ("Zaky", 13), ("Zoya", 8)]
+
+    data_source = PreviewParameters(spark) \
+        .from_input() \
+        .set_data_schema(schema) \
+        .set_data(data)
+    ```
+
+    **2. Autoloader Mode:**
+    In "autoloader" mode, input is loaded using the `cloudFiles` format and settings defined in the preset's
+    `autoloader` field. The format is fetched directly from the preset, while other required options must be
+    provided manually. Example:
+
+    ```python
+    ds_params = PreviewParameters(spark) \
+        .from_autoloader() \
+        .set_autoloader_location("s3://test-databucket/test-data") \
+        .set_pretransform_name("name_of_pretransform") \
+        .set_date_range("EdgeStartTimestamp", "2024-02-15 11:27:21", "2024-02-15 11:27:25")
+    ```
+
+    If you wish to skip the Silver PreTransform stage, simply omit the `pretransform_name` setting.
+
+    **3. Table Mode:**
+    This mode reads input directly from a table:
+
+    ```python
+    ds_params = PreviewParameters(spark) \
+        .from_table() \
+        .set_table("system.access.audit")
+    ```
+
+    **Note:**
+    When using autoloader mode, this implementation requires a location to store a temporary schema for
+    the loaded records. By default, this is set to `"dbfs:/tmp/schemas"`. You can change this using
+    `set_autoloader_temp_schema_location`. Regardless of whether you use the default or a custom path,
+    you must have write permissions for that location.
+    """
+
+    def __init__(self, spark: SparkSession) -> None:
+        """
+        Initializes the PreviewParameters instance with sparse default settings.
+
+        Note: The preset development environment is intended to process only a small number
+        of records at a time. By default, the record limit is set to 10, but this can be overridden
+        if needed.
+
+        Instance Attributes:
+            mode (str): Indicates the source type ("input", "autoloader", or "table").
+            record_limit (int): Maximum number of records to load. Defaults to 10.
+            autoloader_temp_schema_location (str): Temporary location to store the autoloader schema.
+            time_column (str): Column name used for time-based filtering.
+            start_time (str): Start time for filtering.
+            end_time (str): End time for filtering.
+            autoloader_location (str): Filesystem location for autoloader input.
+            autoloader_format (str): Format of the data for autoloader.
+            schema_file (str): Path to a file containing the schema definition.
+            cloudfiles_schema_hints_file (str): Path to a file containing CloudFiles schema hints.
+            cloudfiles_schema_hints (str): Directly provided CloudFiles schema hints.
+            schema_uuid_str (str): Unique identifier for the schema (used in the autoloader schema path).
+            schema (StructType): Schema definition for input data.
+            data (list): In-memory rows (a list of tuples) used to create a DataFrame in "input" mode.
+            table (str): Unity Catalog table name used in "table" mode.
+            pretransform_name (str): Name of the pre-transformation step.
+            df (DataFrame): Internal Spark DataFrame loaded using the specified parameters.
+        """
+        self._spark = spark
+        self._mode = None  # [input, autoloader, table]
+        self._record_limit = 10
+        self._autoloader_temp_schema_location = "dbfs:/tmp/schemas"
+
+        self._time_column = None
+        self._start_time = None
+        self._end_time = None
+
+        self._autoloader_location = None
+        self._autoloader_format = None
+        self._schema_file = None
+        self._cloudfiles_schema_hints_file = None
+        self._cloudfiles_schema_hints = None
+        self._cloudfiles_reader_case_sensitive = "true"
+        self._cloudfiles_multiline = "true"
+        self._schema_uuid_str = str(uuid.uuid4())
+
+        self._schema = None
+        self._data = None
+
+        self._table = None
+
+        self._pretransform_name = None
+
+        self._df = None
+
+    def __enter__(self):
+        """
+        Creates a DataFrame using the configured mode. In "autoloader" mode, records are
+        streamed into an in-memory table and then treated as a batch, which makes some
+        operations easier to emulate without giving up the options available to streaming
+        reads.
+
+        Returns:
+            DataFrame: The resulting DataFrame with input data.
+        """
+        if self._mode == "input":
+            self._df = self._spark.createDataFrame(self._data, self._schema)
+        elif self._mode == "table":
+            self._df = self._spark.table(self._table).limit(self._record_limit)
+        elif self._mode == "autoloader":
+            stream_df = (
+                self._spark.readStream.format("cloudFiles")
+                .option("cloudFiles.format", self._autoloader_format)
+                .option("multiline", self._cloudfiles_multiline)
+                .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
+            )
+
+            if self._schema_file:
+                with open(self._schema_file, "r") as f:
+                    stream_df = stream_df.schema(f.read().strip())
+            else:
+                stream_df = (
+                    stream_df.option("inferSchema", "true")
+                    .option("cloudFiles.inferColumnTypes", "true")
+                    .option(
+                        "cloudFiles.schemaLocation",
+                        f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+                    )
+                )
+
+            if self._cloudfiles_schema_hints:
+                stream_df = stream_df.option(
+                    "cloudFiles.schemaHints", self._cloudfiles_schema_hints
+                )
+            elif self._cloudfiles_schema_hints_file:
+                stream_df = stream_df.option(
+                    "cloudFiles.schemaHintsFile", self._cloudfiles_schema_hints_file
+                )
+
+            stream_df = stream_df.load(self._autoloader_location).limit(
+                self._record_limit
+            )
+
+            query = (
+                stream_df.writeStream.format("memory")
+                .queryName("batch_data")
+                .trigger(once=True)
+                .start()
+            )
+
+            query.awaitTermination()
+
+            self._df = self._spark.table("batch_data")
+
+        if self._time_column:
+            self._df = self._df.filter(
+                f"timestamp({self._time_column}) >= timestamp('{self._start_time}') AND timestamp({self._time_column}) < timestamp('{self._end_time}')"
+            )
+
+        self._df = self._df.withColumn("dasl_id", constant_udf())
+
+        return self._df
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Cleans up the temporary schema created for streaming mode, if it was created.
+        """
+
+        # Get the Databricks dbutils helper out of the notebook namespace.
+        ipython = get_ipython()
+        dbutils = ipython.user_ns["dbutils"]
+
+        dbutils.fs.rm(
+            f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+            recurse=True,
+        )
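The `__enter__`/`__exit__` pair above makes `PreviewParameters` a context manager: `__enter__` materialises a DataFrame for the configured mode (and stamps the placeholder `dasl_id` column), while `__exit__` removes the temporary autoloader schema directory through `dbutils`. A minimal usage sketch for "input" mode, assuming a Databricks notebook where `spark` and `dbutils` are already defined:

```python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from dasl_client.preset_development.preview_parameters import PreviewParameters

schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
data = [("Mikhail", 15), ("Zaky", 13), ("Zoya", 8)]

params = (
    PreviewParameters(spark)  # spark is the notebook's SparkSession
    .from_input()
    .set_data_schema(schema)
    .set_data(data)
)

# __enter__ builds the DataFrame; __exit__ cleans up the temp schema path
# (effectively a no-op for "input" mode, which never writes one).
with params as df:
    df.show()
```

Because `__exit__` reaches for `dbutils` via IPython's user namespace, this pattern only works inside a notebook session.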
+
+    def from_input(self):
+        """
+        Set the data source loader to "input" mode. Requires a schema and data to be provided.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._mode = "input"
+        return self
+
+    def from_autoloader(self):
+        """
+        Set the data source loader to "autoloader" mode. Requires at least the autoloader
+        location to be provided.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._mode = "autoloader"
+        return self
+
+    def from_table(self):
+        """
+        Set the data source loader to "table" mode. Requires a table name to be provided.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._mode = "table"
+        return self
+
+    def set_autoloader_temp_schema_location(self, path: str):
+        """
+        Set the location where the autoloader's streaming-mode schema is created. This is
+        deleted at the end of a run.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._autoloader_temp_schema_location = path
+        return self
+
+    def set_data_schema(self, schema: StructType):
+        """
+        Set the input schema for "input" mode. For example:
+
+            StructType([
+                StructField('name', StringType(), True),
+                StructField('age', IntegerType(), True)
+            ])
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._schema = schema
+        return self
+
+    def set_data(self, data: List[Tuple]):
+        """
+        Set the input data for "input" mode. For example:
+
+            [("Peter", 15), ("Urvi", 13), ("Graeme", 8)]
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._data = data
+        return self
+
+    def set_autoloader_location(self, location: str):
+        """
+        Set where to load data from for "autoloader" mode.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._autoloader_location = location
+        return self
+
+    def set_autoloader_format(self, file_format: str):
+        """
+        Used internally to set the autoloader format.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._autoloader_format = file_format
+        return self
+
+    def set_autoloader_schema_file(self, path: str):
+        """
+        Set the schema file path for "autoloader" mode.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._schema_file = path
+        return self
+
+    def set_autoloader_cloudfiles_schema_hint_file(self, path: str):
+        """
+        Set the cloudFiles schema hints file path for "autoloader" mode.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._cloudfiles_schema_hints_file = path
+        return self
+
+    def set_autoloader_cloudfiles_schema_hints(self, cloudfiles_schema_hints: str):
+        """
+        Set the cloudFiles schema hints string for "autoloader" mode.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._cloudfiles_schema_hints = cloudfiles_schema_hints
+        return self
+
+    def set_autoloader_reader_case_sensitive(self, b: bool):
+        """
+        Set the cloudFiles reader case-sensitive boolean for "autoloader" mode.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._cloudfiles_reader_case_sensitive = "true" if b else "false"
+        return self
+
+    def set_autoloader_multiline(self, b: bool):
+        """
+        Set the cloudFiles multiline boolean for "autoloader" mode.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._cloudfiles_multiline = "true" if b else "false"
+        return self
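The setters above map one-to-one onto the `cloudFiles` reader options that `__enter__` applies. A sketch of chaining them for autoloader mode; the bucket path, format, and schema hint below are illustrative placeholders, not values taken from the package:

```python
from dasl_client.preset_development.preview_parameters import PreviewParameters

# All paths, formats, and hints here are placeholders for illustration.
ds_params = (
    PreviewParameters(spark)
    .from_autoloader()
    .set_autoloader_format("json")                              # -> cloudFiles.format
    .set_autoloader_location("s3://example-bucket/raw/")        # -> readStream.load(...)
    .set_autoloader_cloudfiles_schema_hints(
        "EdgeStartTimestamp TIMESTAMP"                          # -> cloudFiles.schemaHints
    )
    .set_autoloader_reader_case_sensitive(False)                # -> readerCaseSensitive
    .set_autoloader_multiline(True)                             # -> multiline
    .set_autoloader_temp_schema_location("dbfs:/tmp/schemas")   # cloudFiles.schemaLocation prefix
)
```

If `set_autoloader_schema_file` is supplied instead, `__enter__` uses that schema directly and skips inference and the temporary `cloudFiles.schemaLocation`.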
+
+    def set_pretransform_name(self, pretransform_name: str):
+        """
+        Set the pretransform name to use, if desired. If not set, Silver PreTransform
+        will be skipped.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._pretransform_name = pretransform_name
+        return self
+
+    def set_date_range(self, column: str, start_time: str, end_time: str):
+        """
+        Set the TIMESTAMP column and date range to use as the input data filter to
+        limit the number of records retrieved by the loader.
+
+        Both start and end time must be TIMESTAMP compatible.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._time_column = column
+        self._start_time = start_time
+        self._end_time = end_time
+        return self
+
+    def set_input_record_limit(self, record_limit: int):
+        """
+        Set the LIMIT clause when retrieving records from the data source.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._record_limit = record_limit
+        return self
+
+    def set_table(self, table_name: str):
+        """
+        Set Unity Catalog table name for "table" mode.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._table = table_name
+        return self
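Tying the remaining setters together, a sketch of "table" mode with a record limit and a time window; the table name comes from the class docstring, while the column name and timestamps are illustrative and must match a TIMESTAMP column in your table:

```python
from dasl_client.preset_development.preview_parameters import PreviewParameters

params = (
    PreviewParameters(spark)
    .from_table()
    .set_table("system.access.audit")   # Unity Catalog table to preview
    .set_input_record_limit(25)         # applied via .limit() in __enter__
    .set_date_range(                    # applied as a timestamp() filter in __enter__
        "event_time",
        "2024-02-15 00:00:00",
        "2024-02-16 00:00:00",
    )
)

with params as df:
    df.show(truncate=False)
```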