dlt-utils-lib 1.2.0.tar.gz → 1.2.2.tar.gz
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/PKG-INFO +1 -1
- dlt_utils_lib-1.2.2/dlt_utils/dlt_autoloader_recovery_configuration.py +26 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/main_cdc_utils.py +8 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/main_json_utils.py +9 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/PKG-INFO +1 -1
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/SOURCES.txt +1 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_main_cdc_utils.py +21 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_main_json_utils.py +19 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/__init__.py +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_metadata_receiver.py +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_transformations.py +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/dependency_links.txt +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/requires.txt +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/top_level.txt +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/setup.cfg +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/setup.py +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/__init__.py +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/conftest.py +0 -0
- {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_transformation_module.py +0 -0
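In short: 1.2.2 adds a new dlt_utils/dlt_autoloader_recovery_configuration.py module that decides whether Auto Loader recovery options should be applied (either forced via an autoloader_recovery_method setting, or auto-detected from a CF_MANAGED_FILE_EVENTS_INVALID_CONTINUATION_TOKEN error in the latest pipeline event), wires that decision into the CDC and JSON bronze readers, and covers the use_managed_file_events plumbing with two new tests.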
dlt_utils_lib-1.2.2/dlt_utils/dlt_autoloader_recovery_configuration.py (new file)

@@ -0,0 +1,26 @@
+import logging
+
+from databricks.sdk import WorkspaceClient
+
+
+def _needs_autoloader_recovery(pipeline_id: str) -> bool:
+    event = next(iter(WorkspaceClient().pipelines.list_pipeline_events(
+        pipeline_id=pipeline_id, order_by=["timestamp desc"], max_results=1
+    )), None)
+    if event is None:
+        logging.warning("No pipeline events found for pipeline_id=%s", pipeline_id)
+        return False
+    if event.error:
+        for exc in event.error.exceptions or []:
+            if exc.message and "CF_MANAGED_FILE_EVENTS_INVALID_CONTINUATION_TOKEN" in exc.message:
+                return True
+    return False
+
+
+def resolve_autoloader_recovery(spark) -> bool:
+    method = spark.conf.get("autoloader_recovery_method", "AUTO").upper()
+    if method == "ON":
+        return True
+    if method == "AUTO":
+        return _needs_autoloader_recovery(spark.conf.get("pipelines.id"))
+    return False
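The helper reads its mode from the Spark conf (in a DLT pipeline, entries from the pipeline's configuration block surface there): "ON" forces recovery, "AUTO" (the default) inspects the latest pipeline event for the CF_MANAGED_FILE_EVENTS_INVALID_CONTINUATION_TOKEN error, and anything else disables it. A minimal sketch of the "ON"/"OFF" paths, using hypothetical stub classes in place of a real SparkSession so no workspace credentials are needed (the "AUTO" path would additionally call the Databricks pipeline-events API):

from dlt_utils.dlt_autoloader_recovery_configuration import resolve_autoloader_recovery


class _FakeConf:
    # Stand-in for spark.conf, backed by a plain dict.
    def __init__(self, values):
        self._values = values

    def get(self, key, default=None):
        return self._values.get(key, default)


class _FakeSpark:
    def __init__(self, values):
        self.conf = _FakeConf(values)


# "ON" short-circuits to True without touching the pipeline-events API.
assert resolve_autoloader_recovery(_FakeSpark({"autoloader_recovery_method": "ON"})) is True

# Any explicit value other than "ON"/"AUTO" (e.g. "OFF") disables recovery.
assert resolve_autoloader_recovery(_FakeSpark({"autoloader_recovery_method": "OFF"})) is False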
{dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/main_cdc_utils.py

@@ -3,6 +3,7 @@ from typing import Optional
 
 from pyspark.sql.functions import col, expr, current_timestamp
 
+from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
 from .dlt_transformations import (
     add_default_value_for_removed_col,
     apply_partitions,
@@ -105,6 +106,13 @@ def create_bronze_table_definition(spark,
     if use_managed_file_events:
         reader = reader.option("cloudFiles.useManagedFileEvents", "true")
 
+    if resolve_autoloader_recovery(spark):
+        reader = (
+            reader
+            .option("cloudFiles.listOnStart", "true")
+            .option("cloudFiles.validateOptions", "false")
+        )
+
     return reader.load(files_path) \
         .withColumn('cdc_timestamp', col('cdc_timestamp').cast('timestamp')) \
         .withColumn('ar_h_change_seq', col('ar_h_change_seq').cast('string')) \
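When recovery resolves to True, the recovery options are layered on top of whatever Auto Loader options the bronze builder has already set. An illustrative sketch of the resulting reader chain; the cloudFiles.format and schemaLocation options and the variable names are assumptions for context, not copied from the package:

# Hypothetical shape of the bronze Auto Loader reader once recovery is active.
reader = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "parquet")            # assumed base option
    .option("cloudFiles.schemaLocation", schema_path)  # assumed base option
    .option("cloudFiles.useManagedFileEvents", "true")
    # Added in 1.2.2 when resolve_autoloader_recovery(spark) is True:
    .option("cloudFiles.listOnStart", "true")
    .option("cloudFiles.validateOptions", "false")
)
df = reader.load(files_path)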
{dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/main_json_utils.py

@@ -4,6 +4,8 @@ from typing import Callable, Optional
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import col, struct
 
+from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
+
 
 def base_json_replication_process(
     dlt,
@@ -99,6 +101,13 @@ def _build_autoloader_reader(
     if use_managed_file_events:
         reader = reader.option("cloudFiles.useManagedFileEvents", "true")
 
+    if resolve_autoloader_recovery(spark):
+        reader = (
+            reader
+            .option("cloudFiles.listOnStart", "true")
+            .option("cloudFiles.validateOptions", "false")
+        )
+
     return reader.load(source_path)
 
 
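The JSON path's _build_autoloader_reader gains the identical recovery block as the CDC reader above, so both bronze readers defer to the single resolve_autoloader_recovery helper rather than duplicating the event-inspection logic (see the sketch after the CDC hunk for the shape of the resulting chain).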
{dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_main_cdc_utils.py

@@ -76,3 +76,24 @@ def test_base_cdc_replication_process_uses_legacy_names_without_both_schemas(mon
     assert dlt_spy.streaming_tables[0]["name"] == "customer"
     assert dlt_spy.apply_changes_calls[0]["source"] == "bronze_customer"
     assert dlt_spy.apply_changes_calls[0]["target"] == "customer"
+
+
+def test_base_cdc_replication_process_enables_managed_file_events(monkeypatch):
+    captured_bronze_kwargs = {}
+
+    def fake_create_bronze_table_definition(**kwargs):
+        captured_bronze_kwargs.update(kwargs)
+
+    monkeypatch.setattr(main_cdc_utils, "create_bronze_table_definition", fake_create_bronze_table_definition)
+    monkeypatch.setattr(main_cdc_utils, "silver_streaming_process", lambda **kwargs: None)
+
+    main_cdc_utils.base_cdc_replication_process(
+        dlt=object(),
+        spark=object(),
+        cdc_tables_map=[_build_cdc_table()],
+        bronze_directory="dms_bronze",
+        bucket_name="s3://bucket",
+        use_managed_file_events=True,
+    )
+
+    assert captured_bronze_kwargs["use_managed_file_events"] is True
{dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_main_json_utils.py

@@ -122,3 +122,22 @@ def test_base_json_replication_process_skips_clustering_when_partitions_are_set(
     assert dlt_spy.streaming_tables[0]["partition_cols"] == ["event_date"]
     assert dlt_spy.streaming_tables[0]["cluster_by"] is None
     assert dlt_spy.streaming_tables[0]["cluster_by_auto"] is False
+
+
+def test_base_json_replication_process_enables_managed_file_events(monkeypatch):
+    captured_bronze_kwargs = {}
+
+    def fake_create_bronze_json_table_definition(**kwargs):
+        captured_bronze_kwargs.update(kwargs)
+
+    monkeypatch.setattr(main_json_utils, "create_bronze_json_table_definition", fake_create_bronze_json_table_definition)
+    monkeypatch.setattr(main_json_utils, "create_silver_json_table_definition", lambda **kwargs: None)
+
+    main_json_utils.base_json_replication_process(
+        dlt=object(),
+        spark=object(),
+        json_tables_map=[_build_json_table()],
+        use_managed_file_events=True,
+    )
+
+    assert captured_bronze_kwargs["use_managed_file_events"] is True