dasl-client 1.0.28__tar.gz → 1.0.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dasl-client might be problematic.
- {dasl_client-1.0.28 → dasl_client-1.0.29}/PKG-INFO +1 -1
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/client.py +12 -16
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/errors/errors.py +4 -1
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/metadata.py +29 -4
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/preset_development/preview_engine.py +43 -8
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/preset_development/preview_parameters.py +175 -12
- dasl_client-1.0.29/dasl_client/regions.json +6 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client.egg-info/PKG-INFO +1 -1
- {dasl_client-1.0.28 → dasl_client-1.0.29}/pyproject.toml +1 -1
- dasl_client-1.0.28/dasl_client/regions.json +0 -4
- {dasl_client-1.0.28 → dasl_client-1.0.29}/LICENSE +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/README.md +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/__init__.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/auth/__init__.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/auth/auth.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/conn/__init__.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/conn/client_identifier.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/conn/conn.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/errors/__init__.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/exec_rule.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/helpers.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/preset_development/__init__.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/preset_development/errors.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/preset_development/stage.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/regions.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/__init__.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/admin_config.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/content.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/datasource.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/dbui.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/helpers.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/rule.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/types.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/types/workspace_config.py +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client.egg-info/SOURCES.txt +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client.egg-info/dependency_links.txt +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client.egg-info/requires.txt +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client.egg-info/top_level.txt +0 -0
- {dasl_client-1.0.28 → dasl_client-1.0.29}/setup.cfg +0 -0

{dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/client.py
RENAMED

@@ -1,24 +1,22 @@
 from copy import deepcopy
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta
 from time import sleep
-from typing import Any, Callable, Iterator, List, Optional,
-from pydantic import Field
-from pyspark.sql import DataFrame
+from typing import Any, Callable, Iterator, List, Optional, TypeVar
 
 from dasl_api import (
+    ContentV1Api,
     CoreV1Api,
+    CoreV1QueryExtendRequestDateRange,
     DbuiV1Api,
     DbuiV1QueryExtendRequest,
-    CoreV1QueryExtendRequestDateRange,
     DbuiV1QueryGenerateRequest,
     DbuiV1QueryGenerateRequestTimeRange,
     DbuiV1QueryGenerateStatus,
-    DbuiV1QueryLookupRequest,
-    DbuiV1QueryLookupResult,
     DbuiV1QueryHistogramRequest,
     DbuiV1QueryHistogramResult,
+    DbuiV1QueryLookupRequest,
     DbuiV1QueryLookupRequestPagination,
-
+    DbuiV1QueryLookupResult,
     WorkspaceV1Api,
     WorkspaceV1CreateWorkspaceRequest,
     api,
@@ -26,30 +24,28 @@ from dasl_api import (
 from dasl_client.auth.auth import (
     Authorization,
     DatabricksSecretAuth,
-    DatabricksTokenAuth,
     ServiceAccountKeyAuth,
 )
-from dasl_client.metadata import WorkspaceMetadata
 from dasl_client.conn.conn import get_base_conn
 from dasl_client.errors.errors import ConflictError, error_handler
-from .
+from dasl_client.metadata import WorkspaceMetadata
+
 from .exec_rule import ExecRule
+from .helpers import Helpers
 from .regions import Regions
-
 from .types import (
     AdminConfig,
     DataSource,
+    DataSourcePreset,
+    DataSourcePresetsList,
     Dbui,
     Metadata,
     Rule,
-    WorkspaceConfig,
     TransformRequest,
     TransformResponse,
-
-    DataSourcePreset,
+    WorkspaceConfig,
 )
 
-
 T = TypeVar("T")
 
 
{dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/errors/errors.py
RENAMED

@@ -3,7 +3,6 @@ from collections.abc import Callable
 from contextlib import contextmanager
 
 from dasl_api import ApiException
-from urllib3.exceptions import MaxRetryError, RequestError
 
 
 class ConflictError(Exception):
@@ -136,3 +135,7 @@ def error_handler():
         raise e
     except Exception as e:
         raise e
+
+
+class WorkspaceLookupError(Exception):
+    """Internal exception wrapper for workspace lookup errors"""
{dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/metadata.py
RENAMED

@@ -5,6 +5,8 @@ from dasl_api import ApiClient, Configuration, WorkspaceV1Api
 from dasl_api.models import WorkspaceV1WorkspaceMetadata
 from dasl_api.exceptions import ApiException
 
+from .errors.errors import WorkspaceLookupError
+
 
 class WorkspaceMetadata:
     """Workspace metadata lookup functionality for auto-detecting API endpoints."""
@@ -21,13 +23,36 @@ class WorkspaceMetadata:
         :param dasl_host: Optional DASL host to use for the lookup. If None, uses default region.
         :returns: WorkspaceV1WorkspaceMetadata if successful, None if workspace not found
         """
+        hosts = []
         if dasl_host is None:
             # Use default region for metadata lookup
             from .regions import Regions
-            from .helpers import Helpers
 
-
+            for region in Regions.list():
+                hosts.append(Regions.lookup(region))
+        else:
+            hosts.append(dasl_host)
+
+        last_exception = None
+        for host in hosts:
+            try:
+                metadata = WorkspaceMetadata._get_workspace_metadata(
+                    workspace_url, host
+                )
+                if metadata:
+                    return metadata
+            except WorkspaceLookupError as e:
+                last_exception = e
+                continue
+
+        if last_exception:
+            raise last_exception
+        return None
 
+    @staticmethod
+    def _get_workspace_metadata(
+        workspace_url: str, dasl_host: str
+    ) -> Optional[WorkspaceV1WorkspaceMetadata]:
         try:
             # Create an unauthenticated client for the public metadata endpoint
             configuration = Configuration(host=dasl_host)
@@ -55,10 +80,10 @@ class WorkspaceMetadata:
                 raise ValueError(f"Invalid workspace URL: {workspace_url}")
             else:
                 # Other API errors
-                raise
+                raise WorkspaceLookupError(f"Failed to get workspace metadata: {e}")
         except Exception as e:
             # Network errors, encoding errors, etc.
-            raise
+            raise WorkspaceLookupError(f"Failed to get workspace metadata: {e}")
 
     @staticmethod
     def get_endpoint_for_workspace(workspace_url: str) -> Optional[str]:
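The metadata.py change above replaces the single-host lookup with a host-fallback loop: when no `dasl_host` is given, every known region endpoint is tried in turn, the last `WorkspaceLookupError` is remembered, and it is re-raised only if no host succeeds. A minimal standalone sketch of that pattern follows; the `probe()` helper, `LookupFailed` exception, and host list are hypothetical stand-ins, not dasl-client APIs.

```python
class LookupFailed(Exception):
    """Hypothetical stand-in for WorkspaceLookupError."""


def probe(host: str, workspace_url: str) -> str | None:
    """Hypothetical stand-in for a per-host metadata lookup."""
    raise LookupFailed(f"{host} could not resolve {workspace_url}")


def lookup(workspace_url: str, hosts: list[str]) -> str | None:
    last_exception = None
    for host in hosts:
        try:
            result = probe(host, workspace_url)
            if result:
                return result  # first host that answers wins
        except LookupFailed as e:
            last_exception = e  # remember the failure, keep trying other hosts
            continue
    if last_exception:
        raise last_exception  # every host failed: surface the last error seen
    return None  # empty host list: nothing found, nothing failed
```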
{dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/preset_development/preview_engine.py
RENAMED

@@ -2,13 +2,17 @@ from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import *
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.functions import lit, col as col_, sum as sum_, when
+
 from dasl_client.preset_development.preview_parameters import *
 from dasl_client.preset_development.stage import *
 from dasl_client.preset_development.errors import *
+
 import yaml
-
+import os
 from itertools import count
 
+from IPython import get_ipython
+
 
 @udf(StringType())
 def constant_udf(*args):
@@ -362,7 +366,7 @@ class PreviewEngine:
             display = ipython.user_ns["display"]
         else:
             displayHTML = lambda x: print(x)
-            display = lambda x: x.show()
+            display = lambda x, **kwargs: x.show()
 
         def d(txt, lvl) -> None:
             displayHTML(
@@ -376,25 +380,50 @@ class PreviewEngine:
 
         (pre_silver, silver, gold, pre_bronze) = stage_dataframes
         d("Autoloader Input", 1)
-        display(
+        display(
+            input_df,
+            checkpointLocation=os.path.join(
+                self._ds_params.get_checkpoint_temp_location(), "input"
+            ),
+        )
         d("Bronze Pre-Transform", 1)
         for name, df in pre_bronze.items():
             d(f"{name}", 2)
-            display(
+            display(
+                df,
+                checkpointLocation=os.path.join(
+                    self._ds_params.get_checkpoint_temp_location(), f"pre_bronze-{name}"
+                ),
+            )
         d("Silver Pre-Transform", 1)
         if pre_silver:
-            display(
+            display(
+                pre_silver,
+                checkpointLocation=os.path.join(
+                    self._ds_params.get_checkpoint_temp_location(), "pre_silver"
+                ),
+            )
         else:
             d("Skipped", 2)
         d("Silver Transform", 1)
         for name, df in silver.items():
             d(f"{name}", 2)
-            display(
+            display(
+                df,
+                checkpointLocation=os.path.join(
+                    self._ds_params.get_checkpoint_temp_location(), f"silver-{name}"
+                ),
+            )
         d("Gold", 1)
         for full_name, df in gold.items():
             d(f"{full_name}", 2)
             d("Stage output", 3)
-            display(
+            display(
+                df,
+                checkpointLocation=os.path.join(
+                    self._ds_params.get_checkpoint_temp_location(), f"gold-{full_name}"
+                ),
+            )
 
         # NOTE: Name is stored as Gold_name/Silver_input. So we need to get just the Gold table
         # name that we are comparing the dataframe metadata to.
@@ -440,7 +469,13 @@ class PreviewEngine:
                 # alls good. display the output.
                 d("Resultant gold table preview", 3)
                 unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
-                display(
+                display(
+                    unioned_df,
+                    checkpointLocation=os.path.join(
+                        self._ds_params.get_checkpoint_temp_location(),
+                        f"gold-unioned-{full_name}",
+                    ),
+                )
 
     def is_backtick_escaped(self, name: str) -> bool:
         """
{dasl_client-1.0.28 → dasl_client-1.0.29}/dasl_client/preset_development/preview_parameters.py
RENAMED
@@ -1,13 +1,19 @@
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import *
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.functions import col, lit, udf
 from dasl_client.preset_development.errors import *
+
 import uuid
+import os
+
 from IPython import get_ipython
 
+if TYPE_CHECKING:
+    from dasl_client import Client
+
 
 class PreviewParameters:
     """
@@ -128,13 +134,24 @@ class PreviewParameters:
     ```
 
     **Note:**
-    When using autoloader mode, this implementation requires
-
-    `
-
+    When using autoloader mode, this implementation requires locations to store temporary schemas and
+    checkpoints. By default, these paths are automatically determined from your workspace's
+    `daslStoragePath` configuration:
+    - Schema location: `{daslStoragePath}/preset_preview/schemas`
+    - Checkpoint location: `{daslStoragePath}/preset_preview/checkpoints`
+
+    The workspace configuration is retrieved automatically via `Client.for_workspace()`. If you need
+    to use custom paths or don't have access to the DASL API, you can set them explicitly:
+    ```python
+    ds_params = (PreviewParameters(spark)
+        .set_autoloader_temp_schema_location('/Volumes/catalog/schema/volume/schemas')
+        .set_checkpoint_temp_location_base('/Volumes/catalog/schema/volume/checkpoints'))
+    ```
+
+    Regardless of the paths used, you must have write permissions for those locations.
     """
 
-    def __init__(self, spark: SparkSession) -> None:
+    def __init__(self, spark: SparkSession, client: Optional["Client"] = None) -> None:
         """
         Initializes the PreviewParameters instance with sparse default settings.
 
@@ -142,10 +159,19 @@ class PreviewParameters:
         of records at a time. By default, the record limit is set to 10, but this can be overridden
         if needed.
 
+        Args:
+            spark: SparkSession for DataFrame operations.
+            client: Optional DASL client for retrieving workspace configuration.
+                If not provided and storage paths are not set explicitly,
+                a client will be created automatically via Client.for_workspace().
+
         Instance Attributes:
            mode (str): Indicates the source type ("input" or "autoloader").
            record_limit (int): Maximum number of records to load. Defaults to 10.
            autoloader_temp_schema_location (str): Temporary location to store the autoloader schema.
+                Defaults to {daslStoragePath}/preset_preview/schemas.
+            checkpoint_temp_location_base (str): Temporary location to store checkpoints for stream and display.
+                Defaults to {daslStoragePath}/preset_preview/checkpoints.
            time_column (str): Column name used for time-based filtering.
            start_time (str): Start time for filtering.
            end_time (str): End time for filtering.
@@ -161,10 +187,12 @@ class PreviewParameters:
             df (DataFrame): Internal Spark DataFrame loaded using the specified parameters.
         """
         self._spark = spark
+        self._client = client  # Store client for lazy path resolution
         self._mode = None  # [input, table, autoloader, silverbronze]
         self._record_limit = 10
-        self._autoloader_temp_schema_location =
+        self._autoloader_temp_schema_location = None  # Will be resolved lazily
         self._gold_test_schemas = []
+        self._checkpoint_temp_location_base = None  # Will be resolved lazily
 
         self._time_column = None
         self._start_time = None
@@ -192,6 +220,69 @@ class PreviewParameters:
 
         self._df = None
 
+    def _ensure_storage_paths_configured(self) -> None:
+        """
+        Ensure storage paths are configured, either from explicit user settings
+        or from WorkspaceConfig. Only creates Client if paths are not explicitly set.
+
+        Raises:
+            RuntimeError: If daslStoragePath cannot be determined and paths not set
+        """
+        # If both paths already set explicitly, nothing to do
+        if (
+            self._autoloader_temp_schema_location is not None
+            and self._checkpoint_temp_location_base is not None
+        ):
+            return
+
+        # Need to get daslStoragePath from WorkspaceConfig
+        if self._client is None:
+            # Try to auto-create client
+            try:
+                from dasl_client import Client
+
+                self._client = Client.for_workspace()
+            except Exception as e:
+                raise RuntimeError(
+                    "Could not create DASL client to retrieve workspace configuration. "
+                    "Either provide a client explicitly: PreviewParameters(spark, client=client), "
+                    "or set storage paths manually:\n"
+                    "  .set_autoloader_temp_schema_location('/path/to/schemas')\n"
+                    "  .set_checkpoint_temp_location_base('/path/to/checkpoints')\n"
+                    f"Client creation error: {e}"
+                )
+
+        # Get config and extract daslStoragePath
+        try:
+            config = self._client.get_config()
+            dasl_storage_path = config.dasl_storage_path
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to retrieve workspace configuration: {e}\n"
+                "Set storage paths manually if WorkspaceConfig is not available:\n"
+                "  .set_autoloader_temp_schema_location('/path/to/schemas')\n"
+                "  .set_checkpoint_temp_location_base('/path/to/checkpoints')"
+            )
+
+        if not dasl_storage_path:
+            raise RuntimeError(
+                "WorkspaceConfig.dasl_storage_path is not set. "
+                "Configure this in your workspace settings or set paths explicitly:\n"
+                "  .set_autoloader_temp_schema_location('/path/to/schemas')\n"
+                "  .set_checkpoint_temp_location_base('/path/to/checkpoints')"
+            )
+
+        # Build default paths from daslStoragePath
+        if self._autoloader_temp_schema_location is None:
+            self._autoloader_temp_schema_location = os.path.join(
+                dasl_storage_path, "preset_preview", "schemas"
+            )
+
+        if self._checkpoint_temp_location_base is None:
+            self._checkpoint_temp_location_base = os.path.join(
+                dasl_storage_path, "preset_preview", "checkpoints"
+            )
+
     def __create_from_autoloader(self) -> DataFrame:
         stream_df = (
             self._spark.readStream.format("cloudFiles")
@@ -220,7 +311,10 @@ class PreviewParameters:
             .option("cloudFiles.inferColumnTypes", "true")
             .option(
                 "cloudFiles.schemaLocation",
-
+                os.path.join(
+                    self.get_autoloader_temp_schema_location(),
+                    self._schema_uuid_str,
+                ),
             )
         )
 
@@ -239,6 +333,10 @@ class PreviewParameters:
             stream_df.writeStream.format("memory")
             .queryName("batch_data")
             .trigger(availableNow=True)
+            .option(
+                "checkpointLocation",
+                os.path.join(self.get_checkpoint_temp_location(), "memory"),
+            )
             .start()
         )
 
@@ -294,6 +392,13 @@ class PreviewParameters:
         """
         Cleans up the temporary schema created for streaming mode, if it was created.
         """
+        # Only clean up if paths were actually configured
+        # This handles the case where __exit__ is called after an exception in __enter__
+        if (
+            self._autoloader_temp_schema_location is None
+            or self._checkpoint_temp_location_base is None
+        ):
+            return
 
         # Get the Databricks built-in functions out the namespace.
         ipython = get_ipython()
@@ -301,19 +406,32 @@ class PreviewParameters:
             dbutils = ipython.user_ns["dbutils"]
 
             dbutils.fs.rm(
-
+                os.path.join(
+                    self._autoloader_temp_schema_location, self._schema_uuid_str
+                ),
+                recurse=True,
+            )
+            dbutils.fs.rm(
+                os.path.join(
+                    self._checkpoint_temp_location_base, self._schema_uuid_str
+                ),
                 recurse=True,
             )
             for gold_test_schema in self._gold_test_schemas:
                 dbutils.fs.rm(
-
+                    os.path.join(
+                        self._autoloader_temp_schema_location, gold_test_schema
+                    ),
                     recurse=True,
                 )
         else:
             leaked_lines = [
-                f"FYI, we are leaking temp data {self._autoloader_temp_schema_location
+                f"FYI, we are leaking temp data {os.path.join(self._autoloader_temp_schema_location, self._schema_uuid_str)}",
+                os.path.join(
+                    self._checkpoint_temp_location_base, self._schema_uuid_str
+                ),
                 *[
-
+                    os.path.join(self._autoloader_temp_schema_location, x)
                     for x in self._gold_test_schemas
                 ],
             ]
@@ -396,11 +514,56 @@ class PreviewParameters:
         """
         Get the location for the autoloader's streaming mode schema to be created.
 
+        If not explicitly set, defaults to {daslStoragePath}/preset_preview/schemas.
+
         Returns:
             str: The location for the autoloader's streaming mode schema to be created.
+
+        Raises:
+            RuntimeError: If path cannot be determined from WorkspaceConfig
         """
+        self._ensure_storage_paths_configured()
         return self._autoloader_temp_schema_location
 
+    def set_checkpoint_temp_location_base(self, path: str):
+        """
+        Set the base location for the checkpoint to be created. This is
+        deleted at the end of a run.
+
+        Returns:
+            PreviewParameters: The current instance with updated configuration.
+        """
+        self._checkpoint_temp_location_base = path
+        return self
+
+    def get_checkpoint_temp_location_base(self) -> str:
+        """
+        Get the location for the checkpoint to be created.
+
+        If not explicitly set, defaults to {daslStoragePath}/preset_preview/checkpoints.
+
+        Returns:
+            str: The location for the checkpoint to be created.
+
+        Raises:
+            RuntimeError: If path cannot be determined from WorkspaceConfig
+        """
+        self._ensure_storage_paths_configured()
+        return self._checkpoint_temp_location_base
+
+    def get_checkpoint_temp_location(self) -> str:
+        """
+        Get the location where checkpoints to be created.
+
+        Returns:
+            str: The location where checkpoints to be created.
+
+        Raises:
+            RuntimeError: If path cannot be determined from WorkspaceConfig
+        """
+        self._ensure_storage_paths_configured()
+        return os.path.join(self._checkpoint_temp_location_base, self._schema_uuid_str)
+
     def set_data_schema(self, schema: StructType):
         """
         Set the input schema for "input" mode. For example:
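Taken together, the preview_parameters.py changes make storage-path resolution lazy: explicitly set paths are used as-is, and a DASL client (and its `WorkspaceConfig.dasl_storage_path`) is only consulted when a default is actually needed. A hedged usage sketch based on the docstring example in the diff; the `/Volumes/...` paths are placeholders:

```python
from pyspark.sql import SparkSession

from dasl_client.preset_development.preview_parameters import PreviewParameters

spark = SparkSession.builder.getOrCreate()

# Option 1: rely on the defaults. A client is created via Client.for_workspace()
# the first time a storage path is needed, and paths resolve to
# {daslStoragePath}/preset_preview/schemas and .../checkpoints.
ds_params = PreviewParameters(spark)

# Option 2: set both locations explicitly so no DASL client is ever created.
ds_params = (
    PreviewParameters(spark)
    .set_autoloader_temp_schema_location("/Volumes/catalog/schema/volume/schemas")
    .set_checkpoint_temp_location_base("/Volumes/catalog/schema/volume/checkpoints")
)
```

Either way, the process needs write permission on whichever locations end up being used.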
dasl_client-1.0.29/dasl_client/regions.json
ADDED

@@ -0,0 +1,6 @@
+{
+    "aws-us-east-1": "https://api.sl.us-east-1.cloud.databricks.com",
+    "aws-us-west-2": "https://api.sl.us-west-2.cloud.databricks.com",
+    "aws-ap-southeast-2": "https://api.sl.ap-southeast-2.cloud.databricks.com",
+    "aws-eu-central-1": "https://api.sl.eu-central-1.cloud.databricks.com"
+}
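The new regions.json is the region-to-endpoint map that the host fallback above iterates over via `Regions.list()` and `Regions.lookup(region)`; regions.py itself is unchanged in this release. As a rough sketch of how a bundled data file like this can be read, not the actual `Regions` implementation:

```python
import json
from importlib import resources


def load_region_map() -> dict:
    """Hypothetical loader for the packaged regions.json data file."""
    with resources.files("dasl_client").joinpath("regions.json").open("r") as f:
        return json.load(f)


for region, endpoint in load_region_map().items():
    print(f"{region} -> {endpoint}")
```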