dlt-utils-lib 1.2.0__tar.gz → 1.2.2__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (19)
  1. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/PKG-INFO +1 -1
  2. dlt_utils_lib-1.2.2/dlt_utils/dlt_autoloader_recovery_configuration.py +26 -0
  3. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/main_cdc_utils.py +8 -0
  4. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/main_json_utils.py +9 -0
  5. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/PKG-INFO +1 -1
  6. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/SOURCES.txt +1 -0
  7. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_main_cdc_utils.py +21 -0
  8. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_main_json_utils.py +19 -0
  9. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/__init__.py +0 -0
  10. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_metadata_receiver.py +0 -0
  11. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_transformations.py +0 -0
  12. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/dependency_links.txt +0 -0
  13. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/requires.txt +0 -0
  14. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/top_level.txt +0 -0
  15. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/setup.cfg +0 -0
  16. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/setup.py +0 -0
  17. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/__init__.py +0 -0
  18. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/conftest.py +0 -0
  19. {dlt_utils_lib-1.2.0 → dlt_utils_lib-1.2.2}/tests/test_transformation_module.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dlt_utils_lib
3
- Version: 1.2.0
3
+ Version: 1.2.2
4
4
  Summary: UNKNOWN
5
5
  Home-page: UNKNOWN
6
6
  License: UNKNOWN
@@ -0,0 +1,26 @@
1
+ import logging
2
+
3
+ from databricks.sdk import WorkspaceClient
4
+
5
+
6
+ def _needs_autoloader_recovery(pipeline_id: str) -> bool:
7
+ event = next(iter(WorkspaceClient().pipelines.list_pipeline_events(
8
+ pipeline_id=pipeline_id, order_by=["timestamp desc"], max_results=1
9
+ )), None)
10
+ if event is None:
11
+ logging.warning("No pipeline events found for pipeline_id=%s", pipeline_id)
12
+ return False
13
+ if event.error:
14
+ for exc in event.error.exceptions or []:
15
+ if exc.message and "CF_MANAGED_FILE_EVENTS_INVALID_CONTINUATION_TOKEN" in exc.message:
16
+ return True
17
+ return False
18
+
19
+
20
+ def resolve_autoloader_recovery(spark) -> bool:
21
+ method = spark.conf.get("autoloader_recovery_method", "AUTO").upper()
22
+ if method == "ON":
23
+ return True
24
+ if method == "AUTO":
25
+ return _needs_autoloader_recovery(spark.conf.get("pipelines.id"))
26
+ return False
@@ -3,6 +3,7 @@ from typing import Optional
3
3
 
4
4
  from pyspark.sql.functions import col, expr, current_timestamp
5
5
 
6
+ from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
6
7
  from .dlt_transformations import (
7
8
  add_default_value_for_removed_col,
8
9
  apply_partitions,
@@ -105,6 +106,13 @@ def create_bronze_table_definition(spark,
105
106
  if use_managed_file_events:
106
107
  reader = reader.option("cloudFiles.useManagedFileEvents", "true")
107
108
 
109
+ if resolve_autoloader_recovery(spark):
110
+ reader = (
111
+ reader
112
+ .option("cloudFiles.listOnStart", "true")
113
+ .option("cloudFiles.validateOptions", "false")
114
+ )
115
+
108
116
  return reader.load(files_path) \
109
117
  .withColumn('cdc_timestamp', col('cdc_timestamp').cast('timestamp')) \
110
118
  .withColumn('ar_h_change_seq', col('ar_h_change_seq').cast('string')) \
@@ -4,6 +4,8 @@ from typing import Callable, Optional
4
4
  from pyspark.sql import DataFrame
5
5
  from pyspark.sql.functions import col, struct
6
6
 
7
+ from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
8
+
7
9
 
8
10
  def base_json_replication_process(
9
11
  dlt,
@@ -99,6 +101,13 @@ def _build_autoloader_reader(
99
101
  if use_managed_file_events:
100
102
  reader = reader.option("cloudFiles.useManagedFileEvents", "true")
101
103
 
104
+ if resolve_autoloader_recovery(spark):
105
+ reader = (
106
+ reader
107
+ .option("cloudFiles.listOnStart", "true")
108
+ .option("cloudFiles.validateOptions", "false")
109
+ )
110
+
102
111
  return reader.load(source_path)
103
112
 
104
113
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dlt-utils-lib
3
- Version: 1.2.0
3
+ Version: 1.2.2
4
4
  Summary: UNKNOWN
5
5
  Home-page: UNKNOWN
6
6
  License: UNKNOWN
@@ -1,5 +1,6 @@
1
1
  setup.py
2
2
  dlt_utils/__init__.py
3
+ dlt_utils/dlt_autoloader_recovery_configuration.py
3
4
  dlt_utils/dlt_metadata_receiver.py
4
5
  dlt_utils/dlt_transformations.py
5
6
  dlt_utils/main_cdc_utils.py
@@ -76,3 +76,24 @@ def test_base_cdc_replication_process_uses_legacy_names_without_both_schemas(mon
76
76
  assert dlt_spy.streaming_tables[0]["name"] == "customer"
77
77
  assert dlt_spy.apply_changes_calls[0]["source"] == "bronze_customer"
78
78
  assert dlt_spy.apply_changes_calls[0]["target"] == "customer"
79
+
80
+
81
+ def test_base_cdc_replication_process_enables_managed_file_events(monkeypatch):
82
+ captured_bronze_kwargs = {}
83
+
84
+ def fake_create_bronze_table_definition(**kwargs):
85
+ captured_bronze_kwargs.update(kwargs)
86
+
87
+ monkeypatch.setattr(main_cdc_utils, "create_bronze_table_definition", fake_create_bronze_table_definition)
88
+ monkeypatch.setattr(main_cdc_utils, "silver_streaming_process", lambda **kwargs: None)
89
+
90
+ main_cdc_utils.base_cdc_replication_process(
91
+ dlt=object(),
92
+ spark=object(),
93
+ cdc_tables_map=[_build_cdc_table()],
94
+ bronze_directory="dms_bronze",
95
+ bucket_name="s3://bucket",
96
+ use_managed_file_events=True,
97
+ )
98
+
99
+ assert captured_bronze_kwargs["use_managed_file_events"] is True
@@ -122,3 +122,22 @@ def test_base_json_replication_process_skips_clustering_when_partitions_are_set(
122
122
  assert dlt_spy.streaming_tables[0]["partition_cols"] == ["event_date"]
123
123
  assert dlt_spy.streaming_tables[0]["cluster_by"] is None
124
124
  assert dlt_spy.streaming_tables[0]["cluster_by_auto"] is False
125
+
126
+
127
+ def test_base_json_replication_process_enables_managed_file_events(monkeypatch):
128
+ captured_bronze_kwargs = {}
129
+
130
+ def fake_create_bronze_json_table_definition(**kwargs):
131
+ captured_bronze_kwargs.update(kwargs)
132
+
133
+ monkeypatch.setattr(main_json_utils, "create_bronze_json_table_definition", fake_create_bronze_json_table_definition)
134
+ monkeypatch.setattr(main_json_utils, "create_silver_json_table_definition", lambda **kwargs: None)
135
+
136
+ main_json_utils.base_json_replication_process(
137
+ dlt=object(),
138
+ spark=object(),
139
+ json_tables_map=[_build_json_table()],
140
+ use_managed_file_events=True,
141
+ )
142
+
143
+ assert captured_bronze_kwargs["use_managed_file_events"] is True
File without changes
File without changes