dasl-client 1.0.6__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of dasl-client might be problematic.

@@ -0,0 +1,386 @@
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import *
+ from typing import List, Tuple
+ from pyspark.sql.functions import col, lit, udf
+ from dasl_client.preset_development.errors import *
+ import uuid
+ from IPython import get_ipython
+
+
+ @udf(StringType())
+ def constant_udf(*args):
+     return "<sortable_random_id>"
+
+
+ class PreviewParameters:
+     """
+     This class provides three ways of supplying input records to the preset development environment.
+
+     **1. Input Mode:**
+     In "input" mode, the user provides the schema and data directly using `StructType`, `StructField`,
+     and a list of tuples representing the data. For example:
+
+     ```python
+     schema = StructType([
+         StructField('name', StringType(), True),
+         StructField('age', IntegerType(), True)
+     ])
+
+     data = [("Mikhail", 15), ("Zaky", 13), ("Zoya", 8)]
+
+     data_source = PreviewParameters(spark) \
+         .from_input() \
+         .set_data_schema(schema) \
+         .set_data(data)
+     ```
+
+     **2. Autoloader Mode:**
+     In "autoloader" mode, input is loaded using the `cloudFiles` format and settings defined in the preset's
+     `autoloader` field. The format is fetched directly from the preset, while other required options must be
+     provided manually. Example:
+
+     ```python
+     ds_params = PreviewParameters(spark) \
+         .from_autoloader() \
+         .set_autoloader_location("s3://test-databucket/test-data") \
+         .set_pretransform_name("name_of_pretransform") \
+         .set_date_range("EdgeStartTimestamp", "2024-02-15 11:27:21", "2024-02-15 11:27:25")
+     ```
+
+     If you wish to skip the Silver PreTransform stage, simply omit the `pretransform_name` setting.
+
+     **3. Table Mode:**
+     This mode reads input directly from a table:
+
+     ```python
+     ds_params = PreviewParameters(spark) \
+         .from_table() \
+         .set_table("system.access.audit")
+     ```
+
+     **Note:**
+     When using autoloader mode, this implementation requires a location to store a temporary schema for
+     the loaded records. By default, this is set to `"dbfs:/tmp/schemas"`. You can change this using
+     `set_autoloader_temp_schema_location`. Regardless of whether you use the default or a custom path,
+     you must have write permissions for that location.
+     """
+
+     def __init__(self, spark: SparkSession) -> None:
+         """
+         Initializes the PreviewParameters instance with sparse default settings.
+
+         Note: The preset development environment is intended to process only a small number
+         of records at a time. By default, the record limit is set to 10, but this can be overridden
+         if needed.
+
+         Instance Attributes:
+             mode (str): Indicates the source type ("input", "autoloader", or "table").
+             record_limit (int): Maximum number of records to load. Defaults to 10.
+             autoloader_temp_schema_location (str): Temporary location to store the autoloader schema.
+             time_column (str): Column name used for time-based filtering.
+             start_time (str): Start time for filtering.
+             end_time (str): End time for filtering.
+             autoloader_location (str): Filesystem location for autoloader input.
+             autoloader_format (str): Format of the data for autoloader.
+             schema_file (str): Path to a file containing the schema definition.
+             cloudfiles_schema_hints_file (str): Path to a file containing cloudFiles schema hints.
+             cloudfiles_schema_hints (str): Directly provided cloudFiles schema hints.
+             schema_uuid_str (str): Unique identifier for the schema (used in the autoloader schema path).
+             schema (StructType): Schema definition for input data.
+             data (list): In-memory data used to create a DataFrame in "input" mode.
+             pretransform_name (str): Name of the pre-transformation step.
+             df (DataFrame): Internal Spark DataFrame loaded using the specified parameters.
+         """
+         self._spark = spark
+         self._mode = None  # [input, autoloader, table]
+         self._record_limit = 10
+         self._autoloader_temp_schema_location = "dbfs:/tmp/schemas"
+
+         self._time_column = None
+         self._start_time = None
+         self._end_time = None
+
+         self._autoloader_location = None
+         self._autoloader_format = None
+         self._schema_file = None
+         self._cloudfiles_schema_hints_file = None
+         self._cloudfiles_schema_hints = None
+         self._cloudfiles_reader_case_sensitive = "true"
+         self._cloudfiles_multiline = "true"
+         self._schema_uuid_str = str(uuid.uuid4())
+
+         self._schema = None
+         self._data = None
+
+         self._table = None
+
+         self._pretransform_name = None
+
+         self._df = None
+
+     def __enter__(self):
+         """
+         Creates a DataFrame with data using the method specified. In the case of "autoloader",
+         this will stream to a DataFrame that is then treated as a batch. This allows for easier
+         emulation of some operations, while not giving up some of the options allowed by
+         streaming.
+
+         Returns:
+             DataFrame: The resulting DataFrame with input data.
+         """
+         if self._mode == "input":
+             self._df = self._spark.createDataFrame(self._data, self._schema)
+         elif self._mode == "table":
+             self._df = self._spark.table(self._table).limit(self._record_limit)
+         elif self._mode == "autoloader":
+             stream_df = (
+                 self._spark.readStream.format("cloudFiles")
+                 .option("cloudFiles.format", self._autoloader_format)
+                 .option("multiline", self._cloudfiles_multiline)
+                 .option("readerCaseSensitive", self._cloudfiles_reader_case_sensitive)
+             )
+
+             if self._schema_file:
+                 with open(self._schema_file, "r") as f:
+                     stream_df = stream_df.schema(f.read().strip())
+             else:
+                 stream_df = (
+                     stream_df.option("inferSchema", "true")
+                     .option("cloudFiles.inferColumnTypes", "true")
+                     .option(
+                         "cloudFiles.schemaLocation",
+                         f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+                     )
+                 )
+
+             if self._cloudfiles_schema_hints:
+                 stream_df = stream_df.option(
+                     "cloudFiles.schemaHints", self._cloudfiles_schema_hints
+                 )
+             elif self._cloudfiles_schema_hints_file:
+                 stream_df = stream_df.option(
+                     "cloudFiles.schemaHintsFile", self._cloudfiles_schema_hints_file
+                 )
+
+             stream_df = stream_df.load(self._autoloader_location).limit(
+                 self._record_limit
+             )
+
+             query = (
+                 stream_df.writeStream.format("memory")
+                 .queryName("batch_data")
+                 .trigger(once=True)
+                 .start()
+             )
+
+             query.awaitTermination()
+
+             self._df = self._spark.table("batch_data")
+
+         if self._time_column:
+             self._df = self._df.filter(
+                 f"timestamp({self._time_column}) >= timestamp('{self._start_time}') AND timestamp({self._time_column}) < timestamp('{self._end_time}')"
+             )
+
+         self._df = self._df.withColumn("dasl_id", constant_udf())
+
+         return self._df
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         """
+         Cleans up the temporary schema created for streaming mode, if it was created.
+         """
+
+         # Get the Databricks built-in functions out of the user namespace.
+         ipython = get_ipython()
+         dbutils = ipython.user_ns["dbutils"]
+
+         dbutils.fs.rm(
+             f"{self._autoloader_temp_schema_location}/{self._schema_uuid_str}",
+             recurse=True,
+         )
+
+     def from_input(self):
+         """
+         Set the data source loader to "input" mode. Requires a schema and data to be provided.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._mode = "input"
+         return self
+
+     def from_autoloader(self):
+         """
+         Set the data source loader to "autoloader" mode. Requires at least autoloader location
+         to be provided.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._mode = "autoloader"
+         return self
+
+     def from_table(self):
+         """
+         Set the data source loader to "table" mode. Requires a table name to be provided.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._mode = "table"
+         return self
+
+     def set_autoloader_temp_schema_location(self, path: str):
+         """
+         Set the location for the autoloader's streaming mode schema to be created. This is
+         deleted at the end of a run.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._autoloader_temp_schema_location = path
+         return self
+
+     def set_data_schema(self, schema: StructType):
+         """
+         Set the input schema for "input" mode. For example:
+
+             StructType([
+                 StructField('name', StringType(), True),
+                 StructField('age', IntegerType(), True)
+             ])
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._schema = schema
+         return self
+
+     def set_data(self, data: List[Tuple]):
+         """
+         Set the input data for "input" mode. For example:
+
+             [("Peter", 15), ("Urvi", 13), ("Graeme", 8)]
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._data = data
+         return self
+
+     def set_autoloader_location(self, location: str):
+         """
+         Set where to load data from for "autoloader" mode.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._autoloader_location = location
+         return self
+
+     def set_autoloader_format(self, file_format: str):
+         """
+         Used internally to set the autoloader format.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._autoloader_format = file_format
+         return self
+
+     def set_autoloader_schema_file(self, path: str):
+         """
+         Set the schema file path for "autoloader" mode.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._schema_file = path
+         return self
+
+     def set_autoloader_cloudfiles_schema_hint_file(self, path: str):
+         """
+         Set the cloudFiles schema hints file path for "autoloader" mode.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._cloudfiles_schema_hints_file = path
+         return self
+
+     def set_autoloader_cloudfiles_schema_hints(self, cloudfiles_schema_hints: str):
+         """
+         Set the cloudFiles schema hints string for "autoloader" mode.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._cloudfiles_schema_hints = cloudfiles_schema_hints
+         return self
+
+     def set_autoloader_reader_case_sensitive(self, b: bool):
+         """
+         Set the cloudFiles reader case-sensitive boolean for "autoloader" mode.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._cloudfiles_reader_case_sensitive = "true" if b else "false"
+         return self
+
+     def set_autoloader_multiline(self, b: bool):
+         """
+         Set the cloudFiles multiline boolean for "autoloader" mode.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._cloudfiles_multiline = "true" if b else "false"
+         return self
+
+     def set_pretransform_name(self, pretransform_name: str):
+         """
+         Set the pretransform name to use, if desired. If not set, Silver PreTransform
+         will be skipped.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._pretransform_name = pretransform_name
+         return self
+
+     def set_date_range(self, column: str, start_time: str, end_time: str):
+         """
+         Set the TIMESTAMP column and date range to use as the input data filter to
+         limit the number of records retrieved by the loader.
+
+         Both start and end time must be TIMESTAMP compatible.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._time_column = column
+         self._start_time = start_time
+         self._end_time = end_time
+         return self
+
+     def set_input_record_limit(self, record_limit: int):
+         """
+         Set the LIMIT clause when retrieving records from the data source.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._record_limit = record_limit
+         return self
+
+     def set_table(self, table_name: str):
+         """
+         Set the Unity Catalog table name for "table" mode.
+
+         Returns:
+             PreviewParameters: The current instance with updated configuration.
+         """
+         self._table = table_name
+         return self
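
For orientation (commentary on the diff, not part of the published file): the class above is a context manager, so a minimal usage sketch looks like the following. It is pieced together from the docstrings in the diff and assumes a Databricks notebook where `spark` and `dbutils` already exist; the import path `dasl_client.preset_development.preview_parameters` is a guess and may not match the actual module name in the wheel.

```python
# Minimal sketch, assuming a Databricks notebook with `spark`/`dbutils` predefined.
# The module path below is hypothetical; adjust it to wherever PreviewParameters
# lives in the installed dasl_client package.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from dasl_client.preset_development.preview_parameters import PreviewParameters  # hypothetical path

schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
data = [("Mikhail", 15), ("Zaky", 13), ("Zoya", 8)]

params = (
    PreviewParameters(spark)   # builder-style configuration; each setter returns self
    .from_input()              # "input" mode: schema plus in-memory rows
    .set_data_schema(schema)
    .set_data(data)
)

# __enter__ builds the DataFrame and appends the constant "dasl_id" column;
# __exit__ removes the temporary autoloader schema directory (an unused path in input mode).
with params as df:
    df.show()
```

Autoloader mode follows the same pattern but reads through `cloudFiles`: swap `from_input()` for `from_autoloader()`, point `set_autoloader_location(...)` at the data, and optionally narrow the window with `set_date_range(...)`; the temporary schema written under `dbfs:/tmp/schemas` is cleaned up on exit.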