dlt-utils-lib 1.2.1__tar.gz → 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/PKG-INFO +1 -1
  2. dlt_utils_lib-1.2.2/dlt_utils/dlt_autoloader_recovery_configuration.py +26 -0
  3. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/main_cdc_utils.py +9 -3
  4. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/main_json_utils.py +10 -3
  5. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/PKG-INFO +1 -1
  6. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/SOURCES.txt +1 -0
  7. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/__init__.py +0 -0
  8. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_metadata_receiver.py +0 -0
  9. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils/dlt_transformations.py +0 -0
  10. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/dependency_links.txt +0 -0
  11. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/requires.txt +0 -0
  12. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/dlt_utils_lib.egg-info/top_level.txt +0 -0
  13. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/setup.cfg +0 -0
  14. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/setup.py +0 -0
  15. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/__init__.py +0 -0
  16. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/conftest.py +0 -0
  17. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/test_main_cdc_utils.py +0 -0
  18. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/test_main_json_utils.py +0 -0
  19. {dlt_utils_lib-1.2.1 → dlt_utils_lib-1.2.2}/tests/test_transformation_module.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dlt_utils_lib
3
- Version: 1.2.1
3
+ Version: 1.2.2
4
4
  Summary: UNKNOWN
5
5
  Home-page: UNKNOWN
6
6
  License: UNKNOWN
@@ -0,0 +1,26 @@
1
+ import logging
2
+
3
+ from databricks.sdk import WorkspaceClient
4
+
5
+
6
+ def _needs_autoloader_recovery(pipeline_id: str) -> bool:
7
+ event = next(iter(WorkspaceClient().pipelines.list_pipeline_events(
8
+ pipeline_id=pipeline_id, order_by=["timestamp desc"], max_results=1
9
+ )), None)
10
+ if event is None:
11
+ logging.warning("No pipeline events found for pipeline_id=%s", pipeline_id)
12
+ return False
13
+ if event.error:
14
+ for exc in event.error.exceptions or []:
15
+ if exc.message and "CF_MANAGED_FILE_EVENTS_INVALID_CONTINUATION_TOKEN" in exc.message:
16
+ return True
17
+ return False
18
+
19
+
20
+ def resolve_autoloader_recovery(spark) -> bool:
21
+ method = spark.conf.get("autoloader_recovery_method", "AUTO").upper()
22
+ if method == "ON":
23
+ return True
24
+ if method == "AUTO":
25
+ return _needs_autoloader_recovery(spark.conf.get("pipelines.id"))
26
+ return False
@@ -3,6 +3,7 @@ from typing import Optional
3
3
 
4
4
  from pyspark.sql.functions import col, expr, current_timestamp
5
5
 
6
+ from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
6
7
  from .dlt_transformations import (
7
8
  add_default_value_for_removed_col,
8
9
  apply_partitions,
@@ -103,9 +104,14 @@ def create_bronze_table_definition(spark,
103
104
  .option("cloudFiles.inferColumnTypes", "true")
104
105
 
105
106
  if use_managed_file_events:
106
- reader = reader.option("cloudFiles.useManagedFileEvents", "true") \
107
- .option("cloudFiles.listOnStart", "true") \
108
- .option("cloudFiles.validateOptions", "false")
107
+ reader = reader.option("cloudFiles.useManagedFileEvents", "true")
108
+
109
+ if resolve_autoloader_recovery(spark):
110
+ reader = (
111
+ reader
112
+ .option("cloudFiles.listOnStart", "true")
113
+ .option("cloudFiles.validateOptions", "false")
114
+ )
109
115
 
110
116
  return reader.load(files_path) \
111
117
  .withColumn('cdc_timestamp', col('cdc_timestamp').cast('timestamp')) \
@@ -4,6 +4,8 @@ from typing import Callable, Optional
4
4
  from pyspark.sql import DataFrame
5
5
  from pyspark.sql.functions import col, struct
6
6
 
7
+ from .dlt_autoloader_recovery_configuration import resolve_autoloader_recovery
8
+
7
9
 
8
10
  def base_json_replication_process(
9
11
  dlt,
@@ -97,9 +99,14 @@ def _build_autoloader_reader(
97
99
  reader = reader.option("cloudFiles.schemaHints", schema_hints)
98
100
 
99
101
  if use_managed_file_events:
100
- reader = reader.option("cloudFiles.useManagedFileEvents", "true") \
101
- .option("cloudFiles.listOnStart", "true") \
102
- .option("cloudFiles.validateOptions", "false")
102
+ reader = reader.option("cloudFiles.useManagedFileEvents", "true")
103
+
104
+ if resolve_autoloader_recovery(spark):
105
+ reader = (
106
+ reader
107
+ .option("cloudFiles.listOnStart", "true")
108
+ .option("cloudFiles.validateOptions", "false")
109
+ )
103
110
 
104
111
  return reader.load(source_path)
105
112
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dlt-utils-lib
3
- Version: 1.2.1
3
+ Version: 1.2.2
4
4
  Summary: UNKNOWN
5
5
  Home-page: UNKNOWN
6
6
  License: UNKNOWN
@@ -1,5 +1,6 @@
1
1
  setup.py
2
2
  dlt_utils/__init__.py
3
+ dlt_utils/dlt_autoloader_recovery_configuration.py
3
4
  dlt_utils/dlt_metadata_receiver.py
4
5
  dlt_utils/dlt_transformations.py
5
6
  dlt_utils/main_cdc_utils.py
File without changes
File without changes