dasl-client 1.0.25__tar.gz → 1.0.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dasl-client might be problematic.

Files changed (41)
  1. dasl_client-1.0.27/PKG-INFO +144 -0
  2. dasl_client-1.0.27/README.md +129 -0
  3. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/client.py +65 -3
  4. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/conn/conn.py +3 -1
  5. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/helpers.py +1 -1
  6. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/errors.py +20 -0
  7. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/preview_engine.py +136 -42
  8. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/stage.py +23 -2
  9. dasl_client-1.0.27/dasl_client/regions.json +4 -0
  10. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/datasource.py +7 -0
  11. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/dbui.py +138 -33
  12. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/rule.py +29 -1
  13. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/workspace_config.py +69 -24
  14. dasl_client-1.0.27/dasl_client.egg-info/PKG-INFO +144 -0
  15. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/SOURCES.txt +1 -1
  16. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/requires.txt +2 -1
  17. {dasl_client-1.0.25 → dasl_client-1.0.27}/pyproject.toml +3 -2
  18. dasl_client-1.0.25/PKG-INFO +0 -18
  19. dasl_client-1.0.25/dasl_client/regions.json +0 -3
  20. dasl_client-1.0.25/dasl_client.egg-info/PKG-INFO +0 -18
  21. dasl_client-1.0.25/setup.py +0 -16
  22. {dasl_client-1.0.25 → dasl_client-1.0.27}/LICENSE +0 -0
  23. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/__init__.py +0 -0
  24. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/auth/__init__.py +0 -0
  25. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/auth/auth.py +0 -0
  26. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/conn/__init__.py +0 -0
  27. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/conn/client_identifier.py +0 -0
  28. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/errors/__init__.py +0 -0
  29. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/errors/errors.py +0 -0
  30. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/exec_rule.py +0 -0
  31. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/__init__.py +0 -0
  32. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/preset_development/preview_parameters.py +0 -0
  33. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/regions.py +0 -0
  34. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/__init__.py +0 -0
  35. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/admin_config.py +0 -0
  36. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/content.py +0 -0
  37. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/helpers.py +0 -0
  38. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client/types/types.py +0 -0
  39. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/dependency_links.txt +0 -0
  40. {dasl_client-1.0.25 → dasl_client-1.0.27}/dasl_client.egg-info/top_level.txt +0 -0
  41. {dasl_client-1.0.25 → dasl_client-1.0.27}/setup.cfg +0 -0
@@ -0,0 +1,144 @@
+ Metadata-Version: 2.4
+ Name: dasl_client
+ Version: 1.0.27
+ Summary: The DASL client library used for interacting with the DASL workspace
+ Author-email: Antimatter Team <support@antimatter.io>
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: dasl_api==0.1.26
+ Requires-Dist: databricks-sdk>=0.41.0
+ Requires-Dist: pydantic>=2
+ Requires-Dist: typing_extensions>=4.10.0
+ Requires-Dist: pyyaml==6.0.2
+ Dynamic: license-file
+
+ # DASL Client Library
+
+ The DASL (Databricks Antimatter Security Lakehouse) Client Library is a Python SDK for interacting with DASL services.
+ It lets you manage datasources, rules, workspace configurations, and more from
+ Databricks notebooks.
+
+ ## Features
+
+ * **Simple Authentication**: Automatic workspace detection in Databricks notebooks
+ * **Datasource Management**: Create, update, list, and delete datasources
+ * **Rule Management**: Define and manage security detection rules to identify threats
+ * **Workspace Configuration**: Update and retrieve DASL's workspace-level settings
+
+ ## Installation
+
+ Install from PyPI:
+
+ ```bash
+ pip install dasl-client
+ ```
+
+ ## Quick Start
+
+ ### Databricks Notebook Environment (Recommended)
+
+ The DASL client works best in Databricks notebooks with automatic authentication:
+
+ ```python
+ from dasl_client import Client
+
+ # Automatically detects Databricks context and authenticates
+ client = Client.for_workspace()
+ print("Connected to DASL!")
+
+ # List existing datasources
+ print("Existing datasources:")
+ for datasource in client.list_datasources():
+     print(f" - {datasource.metadata.name}")
+
+ # List detection rules
+ print("Existing detection rules:")
+ for rule in client.list_rules():
+     print(f" - {rule.metadata.name}")
+ ```
+
+ ### Creating a Datasource
+
+ ```python
+ from dasl_client import Datasource, Autoloader, Schedule, BronzeSpec, SilverSpec, GoldSpec
+
+ # Create a new datasource
+ datasource = Datasource(
+     source="aws",
+     source_type="cloudtrail",
+     autoloader=Autoloader(
+         enabled=True,
+         schedule=Schedule(
+             at_least_every="1h",
+             enabled=True
+         )
+     ),
+     bronze=BronzeSpec(
+         bronze_table="security_logs_bronze",
+         skip_bronze_loading=False
+     ),
+     silver=SilverSpec(
+         # Configure silver layer here, see the API reference for more details
+     ),
+     gold=GoldSpec(
+         # Configure gold layer here, see the API reference for more details
+     )
+ )
+
+ # Create the datasource
+ created_datasource = client.create_datasource(datasource)
+ print(f"Created datasource: {created_datasource.metadata.name}")
+ ```
+
+ ### Creating a Detection Rule
+
+ ```python
+ from datetime import datetime
+
+ from dasl_client.types import Rule, Schedule
+
+ # Create a new detection rule that flags blocked HTTP activity
+ rule = Rule(
+     schedule=Schedule(
+         at_least_every="2h",
+         enabled=True,
+     ),
+     input=Rule.Input(
+         stream=Rule.Input.Stream(
+             tables=[
+                 Rule.Input.Stream.Table(name="http_activity"),
+             ],
+             filter="disposition = 'Blocked'",
+             starting_timestamp=datetime(2025, 7, 8, 16, 47, 30),
+         ),
+     ),
+     output=Rule.Output(
+         summary="record was blocked",
+     ),
+ )
+
+ try:
+     created_rule = client.create_rule("Detect Blocked HTTP Activity", rule)
+     print(f"Successfully created rule: {created_rule.metadata.name}")
+ except Exception as e:
+     print(f"Error creating rule: {e}")
+ ```
+
+ ## Requirements
+
+ - Python 3.8+
+ - Access to a Databricks workspace with DASL enabled
+ - `databricks-sdk>=0.41.0`
+ - `pydantic>=2`
+
+ ## Documentation
+
+ For complete DASL Client documentation, examples, and API reference:
+
+ - [DASL Client Documentation](https://antimatter-dasl-client.readthedocs-hosted.com/)
+ - [API Reference](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/api-reference/)
+ - [Quickstart Guide](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/quickstart.html)
+
+ ## Support
+
+ - **Email**: support@antimatter.io
+ - **Documentation**: [DASL Documentation](https://docs.sl.antimatter.io)
@@ -0,0 +1,129 @@
+ # DASL Client Library
+
+ The DASL (Databricks Antimatter Security Lakehouse) Client Library is a Python SDK for interacting with DASL services.
+ It lets you manage datasources, rules, workspace configurations, and more from
+ Databricks notebooks.
+
+ ## Features
+
+ * **Simple Authentication**: Automatic workspace detection in Databricks notebooks
+ * **Datasource Management**: Create, update, list, and delete datasources
+ * **Rule Management**: Define and manage security detection rules to identify threats
+ * **Workspace Configuration**: Update and retrieve DASL's workspace-level settings
+
+ ## Installation
+
+ Install from PyPI:
+
+ ```bash
+ pip install dasl-client
+ ```
+
+ ## Quick Start
+
+ ### Databricks Notebook Environment (Recommended)
+
+ The DASL client works best in Databricks notebooks with automatic authentication:
+
+ ```python
+ from dasl_client import Client
+
+ # Automatically detects Databricks context and authenticates
+ client = Client.for_workspace()
+ print("Connected to DASL!")
+
+ # List existing datasources
+ print("Existing datasources:")
+ for datasource in client.list_datasources():
+     print(f" - {datasource.metadata.name}")
+
+ # List detection rules
+ print("Existing detection rules:")
+ for rule in client.list_rules():
+     print(f" - {rule.metadata.name}")
+ ```
+
+ ### Creating a Datasource
+
+ ```python
+ from dasl_client import Datasource, Autoloader, Schedule, BronzeSpec, SilverSpec, GoldSpec
+
+ # Create a new datasource
+ datasource = Datasource(
+     source="aws",
+     source_type="cloudtrail",
+     autoloader=Autoloader(
+         enabled=True,
+         schedule=Schedule(
+             at_least_every="1h",
+             enabled=True
+         )
+     ),
+     bronze=BronzeSpec(
+         bronze_table="security_logs_bronze",
+         skip_bronze_loading=False
+     ),
+     silver=SilverSpec(
+         # Configure silver layer here, see the API reference for more details
+     ),
+     gold=GoldSpec(
+         # Configure gold layer here, see the API reference for more details
+     )
+ )
+
+ # Create the datasource
+ created_datasource = client.create_datasource(datasource)
+ print(f"Created datasource: {created_datasource.metadata.name}")
+ ```
+
+ ### Creating a Detection Rule
+
+ ```python
+ from datetime import datetime
+
+ from dasl_client.types import Rule, Schedule
+
+ # Create a new detection rule that flags blocked HTTP activity
+ rule = Rule(
+     schedule=Schedule(
+         at_least_every="2h",
+         enabled=True,
+     ),
+     input=Rule.Input(
+         stream=Rule.Input.Stream(
+             tables=[
+                 Rule.Input.Stream.Table(name="http_activity"),
+             ],
+             filter="disposition = 'Blocked'",
+             starting_timestamp=datetime(2025, 7, 8, 16, 47, 30),
+         ),
+     ),
+     output=Rule.Output(
+         summary="record was blocked",
+     ),
+ )
+
+ try:
+     created_rule = client.create_rule("Detect Blocked HTTP Activity", rule)
+     print(f"Successfully created rule: {created_rule.metadata.name}")
+ except Exception as e:
+     print(f"Error creating rule: {e}")
+ ```
+
+ ## Requirements
+
+ - Python 3.8+
+ - Access to a Databricks workspace with DASL enabled
+ - `databricks-sdk>=0.41.0`
+ - `pydantic>=2`
+
+ ## Documentation
+
+ For complete DASL Client documentation, examples, and API reference:
+
+ - [DASL Client Documentation](https://antimatter-dasl-client.readthedocs-hosted.com/)
+ - [API Reference](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/api-reference/)
+ - [Quickstart Guide](https://antimatter-dasl-client.readthedocs-hosted.com/en/latest/quickstart.html)
+
+ ## Support
+
+ - **Email**: support@antimatter.io
+ - **Documentation**: [DASL Documentation](https://docs.sl.antimatter.io)
@@ -8,6 +8,8 @@ from pyspark.sql import DataFrame
  from dasl_api import (
      CoreV1Api,
      DbuiV1Api,
+     DbuiV1QueryExtendRequest,
+     CoreV1QueryExtendRequestDateRange,
      DbuiV1QueryGenerateRequest,
      DbuiV1QueryGenerateRequestTimeRange,
      DbuiV1QueryGenerateStatus,
@@ -597,7 +599,7 @@ class Client:
      def exec_rule(
          self,
          spark,
-         rule_in: Rule,
+         rule_in: Rule | str,
      ) -> ExecRule:
          """
          Locally execute a Rule. Must be run from within a Databricks
@@ -607,19 +609,25 @@ class Client:
          :param spark: Spark context from Databricks notebook. Will be
              injected into the execution environment for use by the
              Rule notebook.
-         :param rule_in: The specification of the Rule to execute.
+         :param rule_in:
+             The specification of the Rule to execute. If specified as
+             a string, it should be in YAML format.
          :returns ExecRule: A class containing various information and
              functionality relating to the execution. See the docs for
              ExecRule for additional details, but note that you must
              call its cleanup function or tables created just for this
              request will leak.
          """
+         rule = rule_in
+         if isinstance(rule_in, str):
+             rule = Rule.from_yaml_str(rule_in)
+
          Helpers.ensure_databricks()

          with error_handler():
              result = self._core_client().core_v1_render_rule(
                  self._workspace(),
-                 rule_in.to_api_obj(),
+                 rule.to_api_obj(),
              )

          try:
@@ -794,6 +802,60 @@ class Client:
              .id
          )

+     def extend_query(
+         self,
+         id: str,
+         warehouse: Optional[str] = None,
+         start_date: Optional[str] = None,
+         end_date: Optional[str] = None,
+     ) -> str:
+         """
+         Extend an existing query to cover a larger time range. If the query
+         is ordered by time and contains no aggregations, this will add the
+         additional data to the existing underlying query, returning the
+         existing ID. If the existing table cannot be extended, a new table
+         will be created to cover the updated time range.
+
+         :param id: The ID of the query to extend.
+         :param warehouse: The SQL warehouse used to execute the SQL. If
+             omitted, the default SQL warehouse specified in the workspace
+             config will be used.
+         :param start_date: An optional starting date to extend the existing
+             query by. If not provided, the current start date of the query
+             will be used.
+         :param end_date: An optional end date to extend the existing
+             query by. If not provided, the current end date of the query
+             will be used.
+         :returns str: The ID of the query generation operation. This value
+             can be used with get_query_status to track the progress of
+             the generation process, and eventually to perform lookups
+             on the completed query. If the current query could be extended,
+             this ID will be the same as the one provided. If a new query had
+             to be generated, the new ID is returned.
+         """
+         time_range = None
+         if start_date is not None or end_date is not None:
+             time_range = CoreV1QueryExtendRequestDateRange(
+                 startDate=start_date,
+                 endDate=end_date,
+             )
+
+         req = DbuiV1QueryExtendRequest(
+             warehouse=warehouse,
+             timeRange=time_range,
+         )
+
+         with error_handler():
+             return (
+                 self._dbui_client()
+                 .dbui_v1_query_extend(
+                     self._workspace(),
+                     id,
+                     req,
+                 )
+                 .id
+             )
+
      def get_query_status(
          self,
          id: str,
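
The client.py changes above let `exec_rule` accept a rule written as a YAML string and add `extend_query` for widening the date range of an existing query. A minimal sketch of how these might be called from a notebook, assuming the signatures shown in this diff; the query ID, date strings, and the YAML rule body are illustrative placeholders, and the exact YAML shape expected by `Rule.from_yaml_str` is an assumption.

```python
from dasl_client import Client

client = Client.for_workspace()

# exec_rule() now also accepts the rule specification as a YAML string.
# The YAML body below is a guess at the shape Rule.from_yaml_str expects.
rule_yaml = """
schedule:
  at_least_every: 2h
  enabled: true
input:
  stream:
    tables:
      - name: http_activity
    filter: "disposition = 'Blocked'"
output:
  summary: record was blocked
"""
execution = client.exec_rule(spark, rule_yaml)  # `spark` is the notebook's Spark context
# Per the docstring, call the returned ExecRule's cleanup function when finished,
# otherwise the tables created for this request will leak.

# extend_query() widens the time range of a previously generated query.
# "query-1234" and the dates are placeholders.
new_id = client.extend_query(
    "query-1234",
    warehouse=None,           # fall back to the workspace's default SQL warehouse
    start_date="2025-07-01",
    end_date="2025-07-31",
)
print(client.get_query_status(new_id))
```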
@@ -19,7 +19,9 @@ def get_base_conn(enable_retries: bool = True, host: Optional[str] = None) -> Ap
      :return: An API conn without any auth
      """
      if host is None:
-         host = os.getenv("DASL_API_URL", "https://api.prod.sl.antimatter.io")
+         host = os.getenv(
+             "DASL_API_URL", "https://api.sl.us-east-1.cloud.databricks.com"
+         )
      config = Configuration(host=host)
      if enable_retries:
          # configure retries with backup for all HTTP verbs; we do not limit this to only
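
This change only swaps the fallback host; the `DASL_API_URL` environment variable still takes precedence when set. A small sketch, assuming the module path follows the file layout shown in the file list; the URL is a placeholder, not a real endpoint.

```python
import os

from dasl_client.conn.conn import get_base_conn

# DASL_API_URL is read at call time, so it only needs to be set before the
# connection is created. The value below is a placeholder endpoint.
os.environ["DASL_API_URL"] = "https://dasl.example.internal"

conn = get_base_conn()  # uses DASL_API_URL if set, otherwise the new default host
```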
@@ -3,7 +3,7 @@ import os


  class Helpers:
-     default_region = "us-east-1"
+     default_region = "aws-us-east-1"

      @staticmethod
      def ensure_databricks():
@@ -9,6 +9,26 @@ class PresetError(Exception):
      pass


+ class StageExecutionException(PresetError):
+     def __init__(
+         self,
+         medallion_layer="unknown",
+         exception_map: Dict[str, List[str]] = {},
+         verbose: bool = False,
+     ):
+         self.exception_map = exception_map
+         message = (
+             f"Field specification errors encountered in {medallion_layer} stage.\n\n"
+         )
+         for table, exceptions in exception_map.items():
+             message += f"Table: {table}\n"
+             count = 1
+             for exception in exceptions:
+                 message += f"Exception {count}:\n{exception.split('JVM')[0] if not verbose else exception}\n\n"
+                 count += 1
+         super().__init__(message)
+
+
  class InvalidGoldTableSchemaError(PresetError):
      def __init__(self, schema: str, additional_message: str = ""):
          self.schema = schema
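
The new `StageExecutionException` folds per-table failure messages into one readable error, trimming everything from "JVM" onward unless `verbose` is set. A small sketch of that behaviour, using a made-up error map; the import path is assumed from the file layout above.

```python
from dasl_client.preset_development.errors import StageExecutionException

# Hypothetical Dict[str, List[str]] of table name -> analyzer error strings.
errors = {
    "http_activity": [
        "cannot resolve 'src_ip' given input columns [...]. JVM stacktrace omitted",
    ],
}

try:
    raise StageExecutionException("silver", errors, verbose=False)
except StageExecutionException as exc:
    # With verbose=False, the text from "JVM" onward is dropped from each entry.
    print(exc)
    print(sorted(exc.exception_map))  # -> ['http_activity']
```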
@@ -49,6 +49,7 @@ class PreviewEngine:
          """
          self._spark = spark
          self._ds_params = ds_params
+         self.__stage_exception = {}
          self._preset = yaml.safe_load(preset_yaml_str)
          self._pretransform_name = ds_params._pretransform_name

@@ -129,7 +130,7 @@ class PreviewEngine:
          if missing_keys:
              raise MissingSilverKeysError(missing_keys)

-     def _compile_stages(self) -> None:
+     def _compile_stages(self, force_evaluation: bool = False) -> None:
          """
          Creates Stage objects, setting silver pretransform to None if not provided.
          """
@@ -160,15 +161,21 @@ class PreviewEngine:
                  break

          self._silver = [
-             Stage(self._spark, "silver transform", table)
+             Stage(
+                 self._spark,
+                 "silver transform",
+                 table,
+                 force_evaluation=force_evaluation,
+             )
              for table in self._preset.get("silver", {}).get("transform", [])
          ]
          self._gold = [
-             Stage(self._spark, "gold", table) for table in self._preset.get("gold", [])
+             Stage(self._spark, "gold", table, force_evaluation=force_evaluation)
+             for table in self._preset.get("gold", [])
          ]

      def _run(
-         self, df: DataFrame
+         self, df: DataFrame, verbose: bool = False
      ) -> Tuple[DataFrame, Dict[str, DataFrame], Dict[str, DataFrame]]:
          """
          Runs all stages, in medallion stage order. This allows prior stage outputs to feed
@@ -232,6 +239,14 @@ class PreviewEngine:
          for table in self._silver:
              silver_output_map[table._name] = table.run(df)

+         # Check for silver stage exceptions.
+         # NOTE: These exception lists only get populated if force_evaluation is enabled.
+         for table in self._silver:
+             if exceptions := table.get_exceptions():
+                 self.__stage_exception[table._name] = exceptions
+         if self.__stage_exception:
+             raise StageExecutionException("silver", self.__stage_exception, verbose)
+
          gold_output_map = {}
          for table in self._gold:
              # We store as gold_name/silver_input to prevent clobbering on duplicate gold table use.
@@ -239,12 +254,92 @@ class PreviewEngine:
                  silver_output_map[table._input]
              )

+         # Check for gold stage exceptions.
+         # NOTE: These exception lists only get populated if force_evaluation is enabled.
+         for table in self._gold:
+             if exceptions := table.get_exceptions():
+                 self.__stage_exception[table._name] = exceptions
+         if self.__stage_exception:
+             raise StageExecutionException("gold", self.__stage_exception, verbose)
+
          return (
              (df, silver_output_map, gold_output_map, pre_bronze_output)
              if self._pre_silver
              else (None, silver_output_map, gold_output_map, pre_bronze_output)
          )

+     def __get_sql_type(self, data_type) -> str:
+         """
+         Helper to convert Spark data type objects to SQL type strings.
+         """
+         if isinstance(data_type, StringType):
+             return "STRING"
+         elif isinstance(data_type, IntegerType):
+             return "INT"
+         elif isinstance(data_type, LongType):
+             return "BIGINT"
+         elif isinstance(data_type, FloatType):
+             return "FLOAT"
+         elif isinstance(data_type, DoubleType):
+             return "DOUBLE"
+         elif isinstance(data_type, BooleanType):
+             return "BOOLEAN"
+         elif isinstance(data_type, TimestampType):
+             return "TIMESTAMP"
+         elif isinstance(data_type, DateType):
+             return "DATE"
+         elif isinstance(data_type, ArrayType):
+             return f"ARRAY<{self.__get_sql_type(data_type.elementType)}>"
+         elif isinstance(data_type, MapType):
+             return f"MAP<{self.__get_sql_type(data_type.keyType)}, {self.__get_sql_type(data_type.valueType)}>"
+         elif isinstance(data_type, StructType):
+             fields = ", ".join(
+                 [
+                     f"{field.name}: {self.__get_sql_type(field.dataType)}"
+                     for field in data_type.fields
+                 ]
+             )
+             return f"STRUCT<{fields}>"
+         elif isinstance(data_type, VariantType):
+             return "VARIANT"
+         else:
+             return f"UNKNOWN ({data_type})"
+
+     def __format_gold_column_merge_exception(
+         self,
+         columns: Dict[str, List[Exception]],
+         gold_df: DataFrame,
+         verbose: bool = False,
+     ):
+         """
+         Formatter for various exceptions that occur during the merge of gold tables.
+         """
+         missing_column_flag = False
+         for column, info in columns.items():
+             # RANT: it is annoying, but basically every exception comes back from the
+             # query analyzer as pyspark.errors.exceptions.connect.AnalysisException,
+             # so we are forced into this awkward string search.
+             str_e = str(info["exception"])
+             str_e = str_e.split("JVM")[0] if not verbose else str_e
+             if "LEGACY_ERROR_TEMP_DELTA_0007" in str_e:
+                 print(
+                     f"-> Column \"{column}\" of type \"{self.__get_sql_type(info['type'])}\" does not exist in gold table \"{info['table']}\"."
+                 )
+                 missing_column_flag = True
+             elif "DELTA_FAILED_TO_MERGE_FIELDS" in str_e:
+                 print(
+                     f"-> Column \"{column}\" of type \"{self.__get_sql_type(info['type'])}\" is not compatible with gold table \"{info['table']}\"'s \"{column}\" of type \"{self.__get_sql_type(gold_df.schema[column].dataType)}\""
+                 )
+             else:
+                 print(
+                     f"-> Column \"{column}\" raised the following unformatted exception when appending to gold table \"{info['table']}\":\n{str_e}"
+                 )
+
+         if missing_column_flag:
+             print(
+                 f"\nA write to 1 or more non-existent columns occurred - available columns are: {', '.join(gold_df.columns)}"
+             )
+
      def _render_output(
          self,
          input_df: DataFrame,
@@ -253,6 +348,7 @@ class PreviewEngine:
          ],
          gold_table_catalog: str,
          gold_table_schema: str,
+         verbose: bool = False,
      ) -> None:
          """
          Displays formatted HTML output from executed Stages' DataFrames.
@@ -278,31 +374,6 @@ class PreviewEngine:
              """
          )

-         def check_struct_compatibility(
-             target_field: StructField, df_field: StructField, prefix=""
-         ):
-             if not (
-                 isinstance(target_field.dataType, StructType)
-                 and isinstance(df_field.dataType, StructType)
-             ):
-                 return
-
-             target_fields = {
-                 field.name: field for field in target_field.dataType.fields
-             }
-             for field in df_field.dataType.fields:
-                 if field.name not in target_fields:
-                     raise GoldTableCompatibilityError(
-                         f"Extra field found in gold stage output STRUCT column {prefix}{target_field.name}: {field.name}"
-                     )
-                 else:
-                     if isinstance(field.dataType, StructType):
-                         check_struct_compatibility(
-                             target_fields[field.name],
-                             field,
-                             prefix=prefix + target_field.name + ".",
-                         )
-
          (pre_silver, silver, gold, pre_bronze) = stage_dataframes
          d("Autoloader Input", 1)
          display(input_df)
@@ -343,17 +414,33 @@ class PreviewEngine:
              self._ds_params.add_gold_schema_table(full_name)

              # Perform the type checks by trying to insert data into the table
-             try:
-                 df.write.mode("append").save(
-                     f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
-                 )
-             except Exception as e:
-                 raise GoldTableCompatibilityError(
-                     f"Preset gold table '{full_name}' did not match the gold schema for {fqn_gold_table_name}: {repr(e)}"
-                 )

-             d("Resultant gold table preview", 3)
-             display(df)
+             df_columns = df.columns
+             df_single_columns = {}
+             df_append_exceptions = {}
+             for column in df_columns:
+                 df_single_columns[column] = df.select(column)
+             for column, df_single_column in df_single_columns.items():
+                 try:
+                     df_single_column.write.mode("append").save(
+                         f"{self._ds_params.get_autoloader_temp_schema_location()}/{full_name}"
+                     )
+                 except Exception as e:
+                     df_append_exceptions[column] = {
+                         "type": df_single_column.schema[column].dataType,
+                         "exception": e,
+                         "table": name,
+                     }
+
+             self.__format_gold_column_merge_exception(
+                 df_append_exceptions, delta_df, verbose
+             )
+
+             if not df_append_exceptions:
+                 # all good, display the output.
+                 d("Resultant gold table preview", 3)
+                 unioned_df = delta_df.unionByName(df, allowMissingColumns=True)
+                 display(unioned_df)

      def is_backtick_escaped(self, name: str) -> bool:
          """
@@ -374,7 +461,13 @@ class PreviewEngine:
              return name
          return f"`{name}`"

-     def evaluate(self, gold_table_schema: str, display: bool = True) -> None:
+     def evaluate(
+         self,
+         gold_table_schema: str,
+         display: bool = True,
+         force_evaluation: bool = False,
+         verbose: bool = False,
+     ) -> None:
          """
          Evaluates the loaded preset YAML using the input datasource configuration to load
          records. Finally, checks that the output from the Gold stages is compatible with
@@ -429,16 +522,17 @@ class PreviewEngine:
                  schema_hints_file
              )

-         self._compile_stages()
+         self._compile_stages(force_evaluation=force_evaluation)

          with self._ds_params as df:
-             self._result_df_map = self._run(df)
+             self._result_df_map = self._run(df, verbose)
              if display:
                  self._render_output(
                      df,
                      self._result_df_map,
                      self.force_apply_backticks(catalog_name),
                      self.force_apply_backticks(schema_name),
+                     verbose,
                  )

      def results(
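
With the preview engine changes above, `evaluate()` can force per-stage evaluation so field specification errors surface as a single `StageExecutionException` rather than failing later in the gold merge. A sketch of how the new flags might be used; the `PreviewEngine` constructor arguments, module path, and the gold schema name are assumptions inferred from the attributes shown in this diff.

```python
from dasl_client.preset_development.errors import StageExecutionException
from dasl_client.preset_development.preview_engine import PreviewEngine

# `spark`, `ds_params`, and `preset_yaml` stand in for the notebook's Spark session,
# a configured preview-parameters object, and the preset YAML under development.
engine = PreviewEngine(spark, ds_params, preset_yaml)

try:
    engine.evaluate(
        gold_table_schema="my_catalog.my_gold_schema",  # placeholder schema name
        display=True,
        force_evaluation=True,  # populate the per-stage exception lists checked in _run()
        verbose=False,          # trim JVM stack traces from the aggregated message
    )
except StageExecutionException as exc:
    # exception_map holds the failing tables and their field specification errors.
    for table, messages in exc.exception_map.items():
        print(table, len(messages))
```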