dasl-client 1.0.25__tar.gz → 1.0.27__tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of dasl-client might be problematic.
- dasl_client-1.0.27/PKG-INFO +144 -0
- dasl_client-1.0.27/README.md +129 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/client.py +65 -3
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/conn/conn.py +3 -1
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/helpers.py +1 -1
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/errors.py +20 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/preview_engine.py +136 -42
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/stage.py +23 -2
- dasl_client-1.0.27/dasl_client/regions.json +4 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/datasource.py +7 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/dbui.py +138 -33
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/rule.py +29 -1
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/workspace_config.py +69 -24
- dasl_client-1.0.27/dasl_client.egg-info/PKG-INFO +144 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/SOURCES.txt +1 -1
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/requires.txt +2 -1
- {dasl_client-1.0.25 → dasl_client-1.0.27}/pyproject.toml +3 -2
- dasl_client-1.0.25/PKG-INFO +0 -18
- dasl_client-1.0.25/dasl_client/regions.json +0 -3
- dasl_client-1.0.25/dasl_client.egg-info/PKG-INFO +0 -18
- dasl_client-1.0.25/setup.py +0 -16
- {dasl_client-1.0.25 → dasl_client-1.0.27}/LICENSE +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/__init__.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/auth/__init__.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/auth/auth.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/conn/__init__.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/conn/client_identifier.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/errors/__init__.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/errors/errors.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/exec_rule.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/__init__.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/preview_parameters.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/regions.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/__init__.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/admin_config.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/content.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/helpers.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/types.py +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/dependency_links.txt +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/top_level.txt +0 -0
- {dasl_client-1.0.25 → dasl_client-1.0.27}/setup.cfg +0 -0
dasl_client-1.0.27/PKG-INFO
@@ -0,0 +1,144 @@
+Metadata-Version: 2.4
+Name: dasl_client
+Version: 1.0.27
+Summary: The DASL client library used for interacting with the DASL workspace
+Author-email: Antimatter Team <support@antimatter.io>
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: dasl_api==0.1.26
+Requires-Dist: databricks-sdk>=0.41.0
+Requires-Dist: pydantic>=2
+Requires-Dist: typing_extensions>=4.10.0
+Requires-Dist: pyyaml==6.0.2
+Dynamic: license-file
+
+# DASL Client Library
+
+The DASL (Databricks Antimatter Security Lakehouse) Client Library is a Python SDK for interacting with DASL services.
+This library provides an interface for interacting with DASL services, allowing you to manage
+datasources, rules, workspace configurations, and more from Databricks notebooks.
+
+## Features
+
+* **Simple Authentication**: Automatic workspace detection in Databricks notebooks
+* **Datasource Management**: Create, update, list, and delete datasources
+* **Rule Management**: Define and manage security detection rules to identify threats
+* **Workspace Configuration**: Update and retrieve DASL's workspace-level settings
+
+## Installation
+
+Install from PyPI:
+
+```bash
+pip install dasl-client
+```
+
+## Quick Start
+
+### Databricks Notebook Environment (Recommended)
+
+The DASL client works best in Databricks notebooks with automatic authentication:
+
+```python
+from dasl_client import Client
+
+# Automatically detects Databricks context and authenticates
+client = Client.for_workspace()
+print("Connected to DASL!")
+
+# List existing datasources
+print("Existing datasources:")
+for datasource in client.list_datasources():
+    print(f" - {datasource.metadata.name}")
+
+# List detection rules
+print("Existing detection rules:")
+for rule in client.list_rules():
+    print(f" - {rule.metadata.name}")
+```
+
+### Creating a Datasource
+
+```python
+from dasl_client import Datasource, Autoloader, Schedule, BronzeSpec, SilverSpec, GoldSpec
+
+# Create a new datasource
+datasource = Datasource(
+    source="aws",
+    source_type="cloudtrail",
+    autoloader=Autoloader(
+        enabled=True,
+        schedule=Schedule(
+            at_least_every="1h",
+            enabled=True
+        )
+    ),
+    bronze=BronzeSpec(
+        bronze_table="security_logs_bronze",
+        skip_bronze_loading=False
+    ),
+    silver=SilverSpec(
+        # Configure silver layer here, see the API reference for more details
+    ),
+    gold=GoldSpec(
+        # Configure gold layer here, see the API reference for more details
+    )
+)
+
+# Create the datasource
+created_datasource = client.create_datasource(datasource)
+print(f"Created datasource: {created_datasource.metadata.name}")
+```
+
+### Creating a Detection Rule
+
+```python
+from dasl_client.types import Rule, Schedule
+from datetime import datetime
+# Create a new detection rule to detect blocked HTTP activity
+rule = Rule(
+    schedule=Schedule(
+        at_least_every="2h",
+        enabled=True,
+    ),
+    input=Rule.Input(
+        stream=Rule.Input.Stream(
+            tables=[
+                Rule.Input.Stream.Table(name="http_activity"),
+            ],
+            filter="disposition = 'Blocked'",
+            starting_timestamp=datetime(2025, 7, 8, 16, 47, 30),
+        ),
+    ),
+    output=Rule.Output(
+        summary="record was blocked",
+    ),
+)
+
+try:
+    created_rule = client.create_rule("Detect Blocked HTTP Activity", rule)
+    print(f"Successfully created rule: {created_rule.metadata.name}")
+except Exception as e:
+    print(f"Error creating rule: {e}")
+```
+
+## Requirements
+
+- Python 3.8+
+- Access to a Databricks workspace with DASL enabled
+- `databricks-sdk>=0.41.0`
+- `pydantic>=2`
+
+## Documentation
+
+For complete DASL Client documentation, examples, and API reference:
+
+- [DASL Client Documentation](https://antimatter-dasl-client.readthedocs-hosted.com/)
+- [API Reference](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/api-reference/)
+- [Quickstart Guide](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/quickstart.html)
+
+## Support
+
+- **Email**: support@antimatter.io
+- **Documentation**: [DASL Documentation](https://docs.sl.antimatter.io)
dasl_client-1.0.27/README.md
@@ -0,0 +1,129 @@
+# DASL Client Library
+
+The DASL (Databricks Antimatter Security Lakehouse) Client Library is a Python SDK for interacting with DASL services.
+This library provides an interface for interacting with DASL services, allowing you to manage
+datasources, rules, workspace configurations, and more from Databricks notebooks.
+
+## Features
+
+* **Simple Authentication**: Automatic workspace detection in Databricks notebooks
+* **Datasource Management**: Create, update, list, and delete datasources
+* **Rule Management**: Define and manage security detection rules to identify threats
+* **Workspace Configuration**: Update and retrieve DASL's workspace-level settings
+
+## Installation
+
+Install from PyPI:
+
+```bash
+pip install dasl-client
+```
+
+## Quick Start
+
+### Databricks Notebook Environment (Recommended)
+
+The DASL client works best in Databricks notebooks with automatic authentication:
+
+```python
+from dasl_client import Client
+
+# Automatically detects Databricks context and authenticates
+client = Client.for_workspace()
+print("Connected to DASL!")
+
+# List existing datasources
+print("Existing datasources:")
+for datasource in client.list_datasources():
+    print(f" - {datasource.metadata.name}")
+
+# List detection rules
+print("Existing detection rules:")
+for rule in client.list_rules():
+    print(f" - {rule.metadata.name}")
+```
+
+### Creating a Datasource
+
+```python
+from dasl_client import Datasource, Autoloader, Schedule, BronzeSpec, SilverSpec, GoldSpec
+
+# Create a new datasource
+datasource = Datasource(
+    source="aws",
+    source_type="cloudtrail",
+    autoloader=Autoloader(
+        enabled=True,
+        schedule=Schedule(
+            at_least_every="1h",
+            enabled=True
+        )
+    ),
+    bronze=BronzeSpec(
+        bronze_table="security_logs_bronze",
+        skip_bronze_loading=False
+    ),
+    silver=SilverSpec(
+        # Configure silver layer here, see the API reference for more details
+    ),
+    gold=GoldSpec(
+        # Configure gold layer here, see the API reference for more details
+    )
+)
+
+# Create the datasource
+created_datasource = client.create_datasource(datasource)
+print(f"Created datasource: {created_datasource.metadata.name}")
+```
+
+### Creating a Detection Rule
+
+```python
+from dasl_client.types import Rule, Schedule
+from datetime import datetime
+# Create a new detection rule to detect blocked HTTP activity
+rule = Rule(
+    schedule=Schedule(
+        at_least_every="2h",
+        enabled=True,
+    ),
+    input=Rule.Input(
+        stream=Rule.Input.Stream(
+            tables=[
+                Rule.Input.Stream.Table(name="http_activity"),
+            ],
+            filter="disposition = 'Blocked'",
+            starting_timestamp=datetime(2025, 7, 8, 16, 47, 30),
+        ),
+    ),
+    output=Rule.Output(
+        summary="record was blocked",
+    ),
+)
+
+try:
+    created_rule = client.create_rule("Detect Blocked HTTP Activity", rule)
+    print(f"Successfully created rule: {created_rule.metadata.name}")
+except Exception as e:
+    print(f"Error creating rule: {e}")
+```
+
+## Requirements
+
+- Python 3.8+
+- Access to a Databricks workspace with DASL enabled
+- `databricks-sdk>=0.41.0`
+- `pydantic>=2`
+
+## Documentation
+
+For complete DASL Client documentation, examples, and API reference:
+
+- [DASL Client Documentation](https://antimatter-dasl-client.readthedocs-hosted.com/)
+- [API Reference](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/api-reference/)
+- [Quickstart Guide](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/quickstart.html)
+
+## Support
+
+- **Email**: support@antimatter.io
+- **Documentation**: [DASL Documentation](https://docs.sl.antimatter.io)
{dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/client.py
@@ -8,6 +8,8 @@ from pyspark.sql import DataFrame
 from dasl_api import (
     CoreV1Api,
     DbuiV1Api,
+    DbuiV1QueryExtendRequest,
+    CoreV1QueryExtendRequestDateRange,
     DbuiV1QueryGenerateRequest,
     DbuiV1QueryGenerateRequestTimeRange,
     DbuiV1QueryGenerateStatus,
@@ -597,7 +599,7 @@ class Client:
     def exec_rule(
         self,
         spark,
-        rule_in: Rule,
+        rule_in: Rule | str,
     ) -> ExecRule:
         """
         Locally execute a Rule. Must be run from within a Databricks
@@ -607,19 +609,25 @@ class Client:
         :param spark: Spark context from Databricks notebook. Will be
             injected into the execution environment for use by the
            Rule notebook.
-        :param rule_in:
+        :param rule_in:
+            The specification of the Rule to execute. If specified as
+            a string, it should be in YAML format.
         :returns ExecRule: A class containing various information and
             functionality relating to the execution. See the docs for
             ExecRule for additional details, but note that you must
            call its cleanup function or tables created just for this
            request will leak.
         """
+        rule = rule_in
+        if isinstance(rule_in, str):
+            rule = Rule.from_yaml_str(rule_in)
+
         Helpers.ensure_databricks()

         with error_handler():
             result = self._core_client().core_v1_render_rule(
                 self._workspace(),
-
+                rule.to_api_obj(),
             )

         try:
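With the change above, `exec_rule` accepts either a `Rule` object or a YAML string, which it parses with `Rule.from_yaml_str` before rendering. A minimal sketch of the string form, run from a Databricks notebook; the YAML field names mirror the `Rule` example in the README and are illustrative only, and releasing the temporary tables afterwards is whatever cleanup `ExecRule` documents.

```python
from dasl_client import Client

client = Client.for_workspace()

# Hypothetical rule specification expressed as YAML rather than a Rule object;
# the field names follow the README's Rule example and are illustrative.
rule_yaml = """
schedule:
  at_least_every: 2h
  enabled: true
input:
  stream:
    tables:
      - name: http_activity
    filter: disposition = 'Blocked'
output:
  summary: record was blocked
"""

# exec_rule parses the string with Rule.from_yaml_str and executes it locally;
# `spark` is the Spark session provided by the notebook environment.
execution = client.exec_rule(spark, rule_yaml)

# Inspect `execution`, then call its cleanup function as described in the
# ExecRule docs so the tables created for this run do not leak.
```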
@@ -794,6 +802,60 @@ class Client:
             .id
         )

+    def extend_query(
+        self,
+        id: str,
+        warehouse: Optional[str] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+    ) -> str:
+        """
+        Extend an existing query to cover a larger time range. If the query
+        is ordered by time and contains no aggregations, this will add the
+        additional data to the existing underlying query, returning the
+        existing ID. If the existing table cannot be extended, a new table
+        will be created to cover the updated time range.
+
+        :param id: The ID of the query to extend.
+        :param warehouse: The SQL warehouse used to execute the SQL. If
+            omitted, the default SQL warehouse specified in the workspace
+            config will be used.
+        :param start_date: An optional starting date to extend the existing
+            query by. If not provided, the current start date of the query
+            will be used.
+        :param end_date: An optional end date to extend the existing
+            query by. If not provided, the current end date of the query
+            will be used.
+        :returns str: The ID of the query generation operation. This value
+            can be used with get_query_status to track the progress of
+            the generation process, and eventually to perform lookups
+            on the completed query. If the current query could be extended,
+            this id will be the same as the one provided. If a new query had
+            to be generated, the new ID is returned.
+        """
+        time_range = None
+        if start_date is not None or end_date is not None:
+            time_range = CoreV1QueryExtendRequestDateRange(
+                startDate=start_date,
+                endDate=end_date,
+            )
+
+        req = DbuiV1QueryExtendRequest(
+            warehouse=warehouse,
+            timeRange=time_range,
+        )
+
+        with error_handler():
+            return (
+                self._dbui_client()
+                .dbui_v1_query_extend(
+                    self._workspace(),
+                    id,
+                    req,
+                )
+                .id
+            )
+
     def get_query_status(
         self,
         id: str,
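The new `extend_query` wraps `dbui_v1_query_extend`: it reuses the existing query when its time range can simply be widened and otherwise returns the ID of a freshly generated query. A minimal usage sketch, assuming an existing query ID and polling with `get_query_status` (already part of the client); the ID and date strings are placeholders, and the exact date format is whatever the DASL API expects.

```python
from dasl_client import Client

client = Client.for_workspace()

existing_id = "query-1234"  # placeholder for a previously generated query ID

# Widen the query's date range; omitting `warehouse` falls back to the
# default SQL warehouse from the workspace config.
new_id = client.extend_query(
    existing_id,
    start_date="2025-07-01",
    end_date="2025-07-31",
)

# The same ID means the query was extended in place; a different ID means a
# new query had to be generated to cover the requested range.
if new_id != existing_id:
    print(f"Query was regenerated as {new_id}")

# Track generation progress with the existing status call.
print(client.get_query_status(new_id))
```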
{dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/conn/conn.py
@@ -19,7 +19,9 @@ def get_base_conn(enable_retries: bool = True, host: Optional[str] = None) -> Ap
    :return: An API conn without any auth
    """
    if host is None:
-        host = os.getenv(
+        host = os.getenv(
+            "DASL_API_URL", "https://api.sl.us-east-1.cloud.databricks.com"
+        )
    config = Configuration(host=host)
    if enable_retries:
        # configure retries with backoff for all HTTP verbs; we do not limit this to only
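This change makes the API host configurable through the `DASL_API_URL` environment variable, with the URL shown above as the fallback when neither the variable nor an explicit `host` argument is provided. A small sketch of overriding the endpoint, assuming the variable is set before any connection is created; the override URL here is hypothetical.

```python
import os

# Must be set before the client builds its connection; the URL is a placeholder.
os.environ["DASL_API_URL"] = "https://api.sl.eu-west-1.cloud.databricks.com"

from dasl_client import Client

# Connections created from this point on use the overridden host.
client = Client.for_workspace()
```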
{dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/errors.py
@@ -9,6 +9,26 @@ class PresetError(Exception):
     pass


+class StageExecutionException(PresetError):
+    def __init__(
+        self,
+        medallion_layer="unknown",
+        exception_map: Dict[str, List[str]] = {},
+        verbose: bool = False,
+    ):
+        self.exception_map = exception_map
+        message = (
+            f"Field specification errors encountered in {medallion_layer} stage.\n\n"
+        )
+        for table, exceptions in exception_map.items():
+            message += f"Table: {table}\n"
+            count = 1
+            for exception in exceptions:
+                message += f"Exception {count}:\n{exception.split('JVM')[0] if not verbose else exception}\n\n"
+                count += 1
+        super().__init__(message)
+
+
 class InvalidGoldTableSchemaError(PresetError):
     def __init__(self, schema: str, additional_message: str = ""):
         self.schema = schema
{dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/preview_engine.py
@@ -49,6 +49,7 @@ class PreviewEngine:
         """
         self._spark = spark
         self._ds_params = ds_params
+        self.__stage_exception = {}
         self._preset = yaml.safe_load(preset_yaml_str)
         self._pretransform_name = ds_params._pretransform_name

@@ -129,7 +130,7 @@ class PreviewEngine:
         if missing_keys:
             raise MissingSilverKeysError(missing_keys)

-    def _compile_stages(self) -> None:
+    def _compile_stages(self, force_evaluation: bool = False) -> None:
         """
         Creates Stage objects, setting silver pretransform to None if not provided.
         """
@@ -160,15 +161,21 @@ class PreviewEngine:
                 break

         self._silver = [
-            Stage(
+            Stage(
+                self._spark,
+                "silver transform",
+                table,
+                force_evaluation=force_evaluation,
+            )
             for table in self._preset.get("silver", {}).get("transform", [])
         ]
         self._gold = [
-            Stage(self._spark, "gold", table
+            Stage(self._spark, "gold", table, force_evaluation=force_evaluation)
+            for table in self._preset.get("gold", [])
             for table in self._preset.get("gold", [])
         ]

     def _run(
-        self, df: DataFrame
+        self, df: DataFrame, verbose: bool = False
     ) -> Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]]:
         """
         Runs all stages, in medallion stage order. This allows prior stage outputs to feed
@@ -232,6 +239,14 @@ class PreviewEngine:
         for table in self._silver:
             silver_output_map[table._name] = table.run(df)

+        # Check for silver stage exceptions.
+        # NOTE: These exception lists only get populated if force_evaluation is enabled.
+        for table in self._silver:
+            if exceptions := table.get_exceptions():
+                self.__stage_exception[table._name] = exceptions
+        if self.__stage_exception:
+            raise StageExecutionException("silver", self.__stage_exception, verbose)
+
         gold_output_map = {}
         for table in self._gold:
             # We store as gold_name/silver_input to prevent clobbering on duplicate gold table use.
@@ -239,12 +254,92 @@ class PreviewEngine:
                 silver_output_map[table._input]
             )

+        # Check for gold stage exceptions.
+        # NOTE: These exception lists only get populated if force_evaluation is enabled.
+        for table in self._gold:
+            if exceptions := table.get_exceptions():
+                self.__stage_exception[table._name] = exceptions
+        if self.__stage_exception:
+            raise StageExecutionException("gold", self.__stage_exception, verbose)
+
         return (
             (df, silver_output_map, gold_output_map, pre_bronze_output)
             if self._pre_silver
             else (None, silver_output_map, gold_output_map, pre_bronze_output)
         )

+    def __get_sql_type(self, data_type) -> str:
+        """
+        Helper to convert Spark data type objects to SQL type strings.
+        """
+        if isinstance(data_type, StringType):
+            return "STRING"
+        elif isinstance(data_type, IntegerType):
+            return "INT"
+        elif isinstance(data_type, LongType):
+            return "BIGINT"
+        elif isinstance(data_type, FloatType):
+            return "FLOAT"
+        elif isinstance(data_type, DoubleType):
+            return "DOUBLE"
+        elif isinstance(data_type, BooleanType):
+            return "BOOLEAN"
+        elif isinstance(data_type, TimestampType):
+            return "TIMESTAMP"
+        elif isinstance(data_type, DateType):
+            return "DATE"
+        elif isinstance(data_type, ArrayType):
+            return f"ARRAY<{self.__get_sql_type(data_type.elementType)}>"
+        elif isinstance(data_type, MapType):
+            return f"MAP<{self.__get_sql_type(data_type.keyType)}, {self.__get_sql_type(data_type.valueType)}>"
+        elif isinstance(data_type, StructType):
+            fields = ", ".join(
+                [
+                    f"{field.name}: {self.__get_sql_type(field.dataType)}"
+                    for field in data_type.fields
+                ]
+            )
+            return f"STRUCT<{fields}>"
+        elif isinstance(data_type, VariantType):
+            return f"VARIANT"
+        else:
+            return f"UNKNOWN ({data_type})"
+
+    def __format_gold_column_merge_exception(
+        self,
+        columns: Dict[str, List[Exception]],
+        gold_df: DataFrame,
+        verbose: bool = False,
+    ):
+        """
+        Formatter for various exceptions that occur during the merge of gold tables.
+        """
+        missing_column_flag = False
+        for column, info in columns.items():
+            # RANT: it is annoying, but basically every exception comes back from the
+            # query analyzer as pyspark.errors.exceptions.connect.AnalysisException,
+            # so we are forced into this awkward string search.
+            str_e = str(info["exception"])
+            str_e = str_e.split("JVM")[0] if not verbose else str_e
+            if "LEGACY_ERROR_TEMP_DELTA_0007" in str_e:
+                print(
+                    f"-> Column \"{column}\" of type \"{self.__get_sql_type(info['type'])}\" does not exist in gold table \"{info['table']}\"."
+                )
+                missing_column_flag = True
+            elif "DELTA_FAILED_TO_MERGE_FIELDS" in str_e:
+                print(
+                    f"-> Column \"{column}\" of type \"{self.__get_sql_type(info['type'])}\" is not compatible with gold table \"{info['table']}\"'s \"{column}\" of type \"{self.__get_sql_type(gold_df.schema[column].dataType)}\""
+                )
+            else:
+                print(
+                    f"-> Column \"{column}\" raised the following unformatted exception when appending to gold table \"{info['table']}\":\n{str_e}"
+                )
+
+        if missing_column_flag:
+            print(
+                f"\nA write to 1 or more non-existent columns occurred - available columns are: {', '.join(gold_df.columns)}"
+            )
+
     def _render_output(
         self,
         input_df: DataFrame,
@@ -253,6 +348,7 @@ class PreviewEngine:
         ],
         gold_table_catalog: str,
         gold_table_schema: str,
+        verbose: bool = False,
     ) -> None:
         """
         Displays formatted HTML output from executed Stages' DataFrames.
@@ -278,31 +374,6 @@ class PreviewEngine:
            """
         )

-        def check_struct_compatibility(
-            target_field: StructField, df_field: StructField, prefix=""
-        ):
-            if not (
-                isinstance(target_field.dataType, StructType)
-                and isinstance(df_field.dataType, StructType)
-            ):
-                return
-
-            target_fields = {
-                field.name: field for field in target_field.dataType.fields
-            }
-            for field in df_field.dataType.fields:
-                if field.name not in target_fields:
-                    raise GoldTableCompatibilityError(
-                        f"Extra field found in gold stage output STRUCT column {prefix}{target_field.name}: {field.name}"
-                    )
-                else:
-                    if isinstance(field.dataType, StructType):
-                        check_struct_compatibility(
-                            target_fields[field.name],
-                            field,
-                            prefix=prefix + target_field.name + ".",
-                        )
-
         (pre_silver, silver, gold, pre_bronze) = stage_dataframes
         d("Autoloader Input", 1)
         display(input_df)
@@ -343,17 +414,33 @@ class PreviewEngine:
             self._ds_params.add_gold_schema_table(full_name)

             # Perform the type checks by trying to insert data into the table
-            try:
-                df.write.mode("append").save(
-                    f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
-                )
-            except Exception as e:
-                raise GoldTableCompatibilityError(
-                    f"Preset gold table '{full_name}' did not match the gold schema for {fqn_gold_table_name}: {repr(e)}"
-                )

-
-
+            df_columns = df.columns
+            df_single_columns = {}
+            df_append_exceptions = {}
+            for column in df_columns:
+                df_single_columns[column] = df.select(column)
+            for column, df_single_column in df_single_columns.items():
+                try:
+                    df_single_column.write.mode("append").save(
+                        f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
+                    )
+                except Exception as e:
+                    df_append_exceptions[column] = {
+                        "type": df_single_column.schema[column].dataType,
+                        "exception": e,
+                        "table": name,
+                    }
+
+            self.__format_gold_column_merge_exception(
+                df_append_exceptions, delta_df, verbose
+            )
+
+            if not df_append_exceptions:
+                # all's good. display the output.
+                d("Resultant gold table preview", 3)
+                unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
+                display(unioned_df)

     def is_backtick_escaped(self, name: str) -> bool:
         """
@@ -374,7 +461,13 @@ class PreviewEngine:
             return name
         return f"`{name}`"

-    def evaluate(
+    def evaluate(
+        self,
+        gold_table_schema: str,
+        display: bool = True,
+        force_evaluation: bool = False,
+        verbose: bool = False,
+    ) -> None:
         """
         Evaluates the loaded preset YAML using the input datasource configuration to load
         records. Finally, checks that the output from the Gold stages is compatible with
@@ -429,16 +522,17 @@ class PreviewEngine:
            schema_hints_file
        )

-        self._compile_stages()
+        self._compile_stages(force_evaluation=force_evaluation)

        with self._ds_params as df:
            self._result_df_map = self._run(df, verbose)
            if display:
                self._render_output(
                    df,
                    self._result_df_map,
                    self.force_apply_backticks(catalog_name),
                    self.force_apply_backticks(schema_name),
+                    verbose,
                )

    def results(