dlt-utils-lib 1.2.1__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/PKG-INFO +1 -1
- dlt_utils_lib-1.2.2/dlt_utils/dlt_autoloader_recovery_configuration.py +26 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/main_cdc_utils.py +9 -3
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/main_json_utils.py +10 -3
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/PKG-INFO +1 -1
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/SOURCES.txt +1 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/__init__.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_metadata_receiver.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_transformations.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/dependency_links.txt +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/requires.txt +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/top_level.txt +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/setup.cfg +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/setup.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/__init__.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/conftest.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/test_main_cdc_utils.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/test_main_json_utils.py +0 -0
- {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/test_transformation_module.py +0 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from databricks.sdk import WorkspaceClient
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _needs_autoloader_recovery(pipeline_id: str) -> bool:
    """Report whether the newest pipeline event shows an invalid-token failure.

    Requests a single event for ``pipeline_id`` ordered by ``timestamp desc``
    and inspects only that event.

    Args:
        pipeline_id: Identifier passed to ``list_pipeline_events``.

    Returns:
        ``True`` when that event has an error and one of its exception
        messages contains ``CF_MANAGED_FILE_EVENTS_INVALID_CONTINUATION_TOKEN``;
        ``False`` otherwise, including when no event is returned.
    """
    token_error = "CF_MANAGED_FILE_EVENTS_INVALID_CONTINUATION_TOKEN"

    client = WorkspaceClient()
    events = client.pipelines.list_pipeline_events(
        pipeline_id=pipeline_id,
        order_by=["timestamp desc"],
        max_results=1,
    )
    latest = next(iter(events), None)

    if latest is None:
        logging.warning("No pipeline events found for pipeline_id=%s", pipeline_id)
        return False

    # An event without an error payload cannot carry the token failure.
    if not latest.error:
        return False

    # ``exceptions`` may be None, so fall back to an empty list.
    return any(
        exc.message and token_error in exc.message
        for exc in latest.error.exceptions or []
    )
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def resolve_autoloader_recovery(spark) -> bool:
    """Decide whether the Auto Loader recovery options should be applied.

    The mode is read from the ``autoloader_recovery_method`` Spark conf key
    (default ``"AUTO"``; compared case-insensitively, ignoring surrounding
    whitespace):

    * ``ON``   -- always return ``True``.
    * ``AUTO`` -- return the result of ``_needs_autoloader_recovery`` for the
      pipeline id stored under the ``pipelines.id`` conf key.
    * anything else -- return ``False``.

    Args:
        spark: Object exposing ``conf.get(key, default)``.

    Returns:
        ``True`` when recovery should be enabled, ``False`` otherwise.
    """
    method = spark.conf.get("autoloader_recovery_method", "AUTO").strip().upper()
    if method == "ON":
        return True
    if method != "AUTO":
        return False

    # Pass an explicit default so a missing key yields None instead of raising;
    # without a pipeline id there is nothing to query, so skip the check.
    pipeline_id = spark.conf.get("pipelines.id", None)
    if not pipeline_id:
        logging.warning(
            "pipelines.id is not set in Spark conf; skipping Auto Loader recovery check"
        )
        return False
    return _needs_autoloader_recovery(pipeline_id)
|
|
@@ -3,6 +3,7 @@ from typing import Optional
|
|
|
3
3
|
|
|
4
4
|
from pyspark.sql.functions import col, expr, current_timestamp
|
|
5
5
|
|
|
6
|
+
from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
|
|
6
7
|
from .dlt_transformations import (
|
|
7
8
|
add_default_value_for_removed_col,
|
|
8
9
|
apply_partitions,
|
|
@@ -103,9 +104,14 @@ def create_bronze_table_definition(spark,
|
|
|
103
104
|
.option("cloudFiles.inferColumnTypes", "true")
|
|
104
105
|
|
|
105
106
|
if use_managed_file_events:
|
|
106
|
-
reader = reader.option("cloudFiles.useManagedFileEvents", "true")
|
|
107
|
-
|
|
108
|
-
|
|
107
|
+
reader = reader.option("cloudFiles.useManagedFileEvents", "true")
|
|
108
|
+
|
|
109
|
+
if resolve_autoloader_recovery(spark):
|
|
110
|
+
reader = (
|
|
111
|
+
reader
|
|
112
|
+
.option("cloudFiles.listOnStart", "true")
|
|
113
|
+
.option("cloudFiles.validateOptions", "false")
|
|
114
|
+
)
|
|
109
115
|
|
|
110
116
|
return reader.load(files_path) \
|
|
111
117
|
.withColumn('cdc_timestamp', col('cdc_timestamp').cast('timestamp')) \
|
|
@@ -4,6 +4,8 @@ from typing import Callable, Optional
|
|
|
4
4
|
from pyspark.sql import DataFrame
|
|
5
5
|
from pyspark.sql.functions import col, struct
|
|
6
6
|
|
|
7
|
+
from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
|
|
8
|
+
|
|
7
9
|
|
|
8
10
|
def base_json_replication_process(
|
|
9
11
|
dlt,
|
|
@@ -97,9 +99,14 @@ def _build_autoloader_reader(
|
|
|
97
99
|
reader = reader.option("cloudFiles.schemaHints", schema_hints)
|
|
98
100
|
|
|
99
101
|
if use_managed_file_events:
|
|
100
|
-
reader = reader.option("cloudFiles.useManagedFileEvents", "true")
|
|
101
|
-
|
|
102
|
-
|
|
102
|
+
reader = reader.option("cloudFiles.useManagedFileEvents", "true")
|
|
103
|
+
|
|
104
|
+
if resolve_autoloader_recovery(spark):
|
|
105
|
+
reader = (
|
|
106
|
+
reader
|
|
107
|
+
.option("cloudFiles.listOnStart", "true")
|
|
108
|
+
.option("cloudFiles.validateOptions", "false")
|
|
109
|
+
)
|
|
103
110
|
|
|
104
111
|
return reader.load(source_path)
|
|
105
112
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|