cloe-nessy 0.3.16.6b0__py3-none-any.whl → 0.3.16.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/delta_loader/__init__.py +14 -0
- cloe_nessy/integration/delta_loader/delta_load_options.py +37 -0
- cloe_nessy/integration/delta_loader/delta_loader.py +165 -0
- cloe_nessy/integration/delta_loader/delta_loader_factory.py +53 -0
- cloe_nessy/integration/delta_loader/delta_loader_metadata_table.py +68 -0
- cloe_nessy/integration/delta_loader/strategies/__init__.py +9 -0
- cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py +361 -0
- cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py +163 -0
- cloe_nessy/integration/reader/catalog_reader.py +33 -6
- cloe_nessy/integration/reader/file_reader.py +23 -0
- cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +1 -1
- cloe_nessy/logging/logger_mixin.py +0 -1
- cloe_nessy/models/column.py +1 -1
- cloe_nessy/models/table.py +4 -3
- cloe_nessy/object_manager/table_manager.py +3 -1
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +36 -3
- cloe_nessy/pipeline/actions/read_files.py +45 -3
- cloe_nessy/pipeline/actions/transform_deduplication.py +7 -12
- cloe_nessy/pipeline/actions/transform_hash_columns.py +7 -7
- cloe_nessy/pipeline/actions/write_catalog_table.py +5 -0
- cloe_nessy/pipeline/actions/write_delta_append.py +15 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +23 -0
- cloe_nessy/pipeline/actions/write_file.py +6 -1
- cloe_nessy/pipeline/utils/__init__.py +5 -0
- cloe_nessy/pipeline/utils/delta_load_utils.py +36 -0
- cloe_nessy/utils/column_names.py +9 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/METADATA +1 -1
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/RECORD +31 -20
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/WHEEL +0 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/top_level.txt +0 -0
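
The headline change is the new `cloe_nessy.integration.delta_loader` package (options model, factory, metadata table, and two loading strategies). Before the per-file hunks below, a rough orientation sketch of the entry point, inferred only from the signatures visible in this diff; the fields of `DeltaLoadOptions` are not expanded here, so its construction is assumed to happen elsewhere:

    from cloe_nessy.integration.delta_loader.delta_load_options import DeltaLoadOptions
    from cloe_nessy.integration.delta_loader.delta_loader_factory import DeltaLoaderFactory

    def incremental_read(table_identifier: str, delta_load_options: DeltaLoadOptions):
        # The factory resolves a concrete loader for the table, presumably
        # choosing between the CDF and timestamp strategies shown below.
        loader = DeltaLoaderFactory.create_loader(
            table_identifier=table_identifier,
            options=delta_load_options,
        )
        # read_data accepts additional DataFrameReader options as a plain dict.
        return loader.read_data(options={})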
cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py
@@ -0,0 +1,361 @@
+from pydantic import BaseModel, Field
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as F
+from pyspark.sql.window import Window
+
+from ....models import Column
+from ....utils.column_names import generate_unique_column_name
+from ..delta_loader import DeltaLoader
+
+
+class DeltaCDFConfig(BaseModel):
+    """This class holds the config for the DeltaCDFLoader.
+
+    Args:
+        deduplication_columns: A list of columns used for deduplication.
+        from_commit_version: The starting commit version. If None, it starts from the first viable version.
+        to_commit_version: The ending commit version. If None, it goes up to the latest version.
+        enable_full_load: Enables an initial full load of the target table. If
+            no valid delta load history for the table exists, the delta loader
+            will do a full load of the target table and set the metadata to the
+            newest commit version. This might be useful if the change data feed
+            history is incomplete, either because the table was vacuumed or the
+            change data feed was enabled later in the lifecycle of the table.
+            Otherwise the table will initially be loaded from the first valid
+            commit version. When True, `from_commit_version` and
+            `to_commit_version` will be ignored on the initial load. Defaults to
+            False.
+    """
+
+    deduplication_columns: list[str | Column] | None = Field(default=None)
+    from_commit_version: int | None = Field(default=None)
+    to_commit_version: int | None = Field(default=None)
+    enable_full_load: bool = Field(default=False)
+
+
+class DeltaCDFLoader(DeltaLoader):
+    """Implementation of the DeltaLoader interface using CDF strategy.
+
+    Args:
+        config: Configuration for the DeltaCDFLoader.
+        table_identifier: Identifier for the table to be loaded.
+        delta_load_identifier: Identifier for the delta load.
+        metadata_table_identifier: Identifier for the metadata table. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        config: DeltaCDFConfig,
+        table_identifier: str,
+        delta_load_identifier: str,
+        metadata_table_identifier: str | None = None,
+    ):
+        super().__init__(
+            table_identifier,
+            delta_load_identifier,
+            metadata_table_identifier,
+        )
+        self.config = config
+        self.table_reader = self._spark.read
+
+    def _check_cdf_enabled(self, table_identifier: str) -> bool:
+        """Checks if Change Data Feed is enabled for the table."""
+        try:
+            # Try catalog table approach first (for table names like catalog.schema.table)
+            if table_identifier.count(".") == 2 and not table_identifier.startswith("/"):
+                table_properties = self._query(f"SHOW TBLPROPERTIES {table_identifier}").collect()
+                properties_dict = {row["key"]: row["value"] for row in table_properties}
+                value = properties_dict.get("delta.enableChangeDataFeed", "false")
+                return str(value).lower() == "true"
+            # For file paths, use Delta Table API directly
+            from delta import DeltaTable # type: ignore[import-untyped]
+
+            delta_table = DeltaTable.forPath(self._spark, table_identifier)
+            properties = delta_table.detail().select("properties").collect()[0]["properties"]
+            value = properties.get("delta.enableChangeDataFeed", "false") if properties else "false"
+            return str(value).lower() == "true"
+        except Exception:
+            # If we can't determine CDF status, assume it's not enabled
+            return False
+
+    def _has_valid_metadata(self) -> bool:
+        """Checks if valid (i.e. non-stale) metadata exists for the delta load."""
+        try:
+            df = self._spark.sql(f"""
+                SELECT * FROM {self.metadata_table_identifier}
+                WHERE source_table_identifier = '{self.table_identifier}'
+                AND delta_load_identifier = '{self.delta_load_identifier}'
+                AND is_processed = true
+                AND is_stale = false
+            """)
+            return not df.isEmpty()
+        except Exception as e:
+            self._console_logger.warning(f"Error accessing metadata table: {e}")
+            return False
+
+    def _get_commit_versions(self) -> tuple[int, int]:
+        """Retrieves the starting and ending commit versions for CDF data."""
+
+        def _get_metadata_df() -> DataFrame:
+            df = self.table_reader.table(self.metadata_table_identifier)
+            return df.filter(
+                (F.col("source_table_identifier") == self.table_identifier)
+                & (F.col("delta_load_identifier") == self.delta_load_identifier)
+                & F.col("is_processed")
+                & ~F.col("is_stale"),
+            )
+
+        def _get_commit_version(query: DataFrame, version_filter: str | None = None) -> int | None:
+            if version_filter is not None:
+                query = query.filter(version_filter)
+            row = query.selectExpr("max(version)").first()
+            if row is None or row[0] is None:
+                return None
+            # Add type validation before casting
+            version_value = row[0]
+            if not isinstance(version_value, (int | float)) or isinstance(version_value, bool):
+                raise TypeError(f"Expected numeric version, got {type(version_value)}: {version_value}")
+            return int(version_value)
+
+        metadata_df = _get_metadata_df()
+        self._console_logger.info("Querying table history to find minimum version.")
+        min_version_filter = None
+        if self.config.from_commit_version is not None:
+            min_version_filter = f"version >= {self.config.from_commit_version}"
+        # Handle history queries for both catalog tables and file paths
+        if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+            # Catalog table
+            history_query = f"DESCRIBE HISTORY {self.table_identifier}"
+        else:
+            # File path - need to use delta.`path` format
+            history_query = f"DESCRIBE HISTORY delta.`{self.table_identifier}`"
+
+        min_commit_version = _get_commit_version(
+            self._query(history_query).filter(
+                "operation like 'CREATE%' OR operation = 'TRUNCATE' OR operationParameters.properties like '%delta.enableChangeDataFeed%' "
+            ),
+            min_version_filter,
+        )
+        if min_commit_version is None:
+            min_commit_version = 0
+
+        max_version_filter = None
+        if self.config.to_commit_version is not None:
+            max_version_filter = f"version <= {self.config.to_commit_version}"
+        max_commit_version = _get_commit_version(
+            self._query(history_query),
+            max_version_filter,
+        )
+        if min_commit_version is None or max_commit_version is None:
+            raise RuntimeError(f"No valid versions found for Table [ '{self.table_identifier}' ].")
+
+        # Handle cases based on metadata
+        if metadata_df.isEmpty():
+            # Case 1: No metadata found, read all versions (first delta load)
+            self._console_logger.info("No CDF History for this identifier, reading all versions.")
+            commit_tuple = (min_commit_version, max_commit_version)
+            self._console_logger.info(f"Reading Versions: {commit_tuple}")
+            return commit_tuple
+
+        start_commit_row = metadata_df.agg(F.max("end_commit_version")).first()
+        start_commit_version = start_commit_row[0] if start_commit_row is not None else None
+        if start_commit_version is None:
+            # Case 2: No processed version found in metadata, treat as no metadata
+            self._console_logger.info("No processed version found in metadata, reading all versions.")
+            commit_tuple = (min_commit_version, max_commit_version)
+            self._console_logger.info(f"Reading Versions: {commit_tuple}")
+            return commit_tuple
+
+        if start_commit_version > max_commit_version:
+            # Case 3: Last processed version in metadata is greater than last version in table history
+            # This can happen if the table is recreated after the last processed version
+            raise RuntimeError(
+                f"Table ['{self.table_identifier}'] history and CDF metadata are incompatible. "
+                "Either reset the CDF metadata and recreate the target table from scratch,"
+                "or repair CDF metadata."
+            )
+
+        if min_commit_version > start_commit_version:
+            # Case 4: First version in table history is greater than last processed version in metadata
+            # This can happen if the table is truncated after the last processed version
+            self._console_logger.info("The first version in Table history is greater than the last processed version.")
+            commit_tuple = (min_commit_version, max_commit_version)
+            self._console_logger.info(f"Reading Versions: {commit_tuple}")
+            return commit_tuple
+
+        # Case 5: Normal case, read from last processed version to last available version
+        self._console_logger.info("Reading from the last processed version to the last available version.")
+        commit_tuple = (start_commit_version, max_commit_version)
+        self._console_logger.info(f"Reading Versions: {commit_tuple}")
+        return commit_tuple
+
+    def verify(self) -> None:
+        """Verify that the source table has the Change Data Feed enabled."""
+        self._console_logger.info("Verifying table is enabled for Change Data Feed.")
+        if not self._check_cdf_enabled(self.table_identifier):
+            raise RuntimeError(f"Table {self.table_identifier} is not enabled for Change Data Feed.")
+
+    def _full_load(self, options: dict[str, str]) -> DataFrame:
+        self._console_logger.info(f"Performing full load from source table: {self.table_identifier}")
+
+        # Handle history queries for both catalog tables and file paths
+        if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+            # Catalog table
+            history_query = f"DESCRIBE HISTORY {self.table_identifier}"
+        else:
+            # File path - need to use delta.`path` format
+            history_query = f"DESCRIBE HISTORY delta.`{self.table_identifier}`"
+
+        max_version_query = self._query(history_query).selectExpr("max(version)").first()
+        if not max_version_query or max_version_query[0] is None:
+            raise RuntimeError(f"No valid versions found for Table [ '{self.table_identifier}' ].")
+
+        # Add type validation before casting
+        version_value = max_version_query[0]
+        if not isinstance(version_value, (int | float)) or isinstance(version_value, bool):
+            raise TypeError(f"Expected numeric version, got {type(version_value)}: {version_value}")
+
+        start_version = 0
+        end_version = int(version_value)
+        start_commit_timestamp = None
+        end_commit_timestamp = None
+
+        self.table_reader.options(**options)
+
+        # Handle table reading for both catalog tables and file paths
+        if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+            # Catalog table
+            df = self.table_reader.table(self.table_identifier)
+        else:
+            # File path - use load method
+            df = self.table_reader.load(self.table_identifier)
+
+        # Cache the DataFrame since it will be used for both counting and returning
+        df.cache()
+        row_count = df.count()
+
+        self._create_metadata_entry(
+            rows=row_count,
+            last_read_timestamp=end_commit_timestamp,
+            start_version=start_version,
+            end_version=end_version,
+            start_commit_timestamp=start_commit_timestamp,
+            end_commit_timestamp=end_commit_timestamp,
+        )
+
+        # Note: We keep the DataFrame cached since it's returned to the caller
+        # The caller is responsible for unpersisting when done
+        return df
+
+    def _delta_load(self, options: dict[str, str]) -> DataFrame:
+        self._console_logger.info(f"Performing delta load from source table: {self.table_identifier}")
+        start_version, end_version = self._get_commit_versions()
+
+        self._invalidate_versions()
+
+        if start_version != end_version:
+            # Increment version by one to avoid reading the same version twice
+            read_start_version = str(start_version + 1)
+        else:
+            read_start_version = str(start_version)
+
+        self._console_logger.info(f"Reading commit versions: (from: {read_start_version}, to: {str(end_version)})")
+        # Set CDF-specific options
+        self.table_reader.option("readChangeFeed", "true")
+        self.table_reader.option("startingVersion", read_start_version)
+        self.table_reader.option("endingVersion", str(end_version))
+
+        # Set additional options
+        for key, value in options.items():
+            self.table_reader.option(key, str(value))
+
+        # Handle table reading for both catalog tables and file paths
+        if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+            # Catalog table
+            df = self.table_reader.table(self.table_identifier)
+        else:
+            # File path - use load method
+            df = self.table_reader.load(self.table_identifier)
+
+        df = df.filter("_change_type <> 'update_preimage'")
+
+        # Cache the DataFrame as it will be used multiple times
+        df.cache()
+
+        # Optimize timestamp extraction by combining operations
+        start_commit_timestamp = None
+        end_commit_timestamp = None
+
+        if start_version != end_version:
+            # Combine both timestamp extractions into a single operation
+            timestamp_df = (
+                df.filter(F.col("_commit_version").isin([start_version, end_version]))
+                .select("_commit_version", "_commit_timestamp")
+                .collect()
+            )
+
+            timestamp_map = {row["_commit_version"]: row["_commit_timestamp"] for row in timestamp_df}
+            start_commit_timestamp = timestamp_map.get(start_version)
+            end_commit_timestamp = timestamp_map.get(end_version)
+
+        # Handle case where start_version == end_version
+        if start_version == end_version:
+            df = df.limit(0)
+            row_count = 0
+        else:
+            row_count = df.count()
+
+        self._create_metadata_entry(
+            rows=row_count,
+            last_read_timestamp=end_commit_timestamp,
+            start_version=start_version,
+            end_version=end_version,
+            start_commit_timestamp=start_commit_timestamp,
+            end_commit_timestamp=end_commit_timestamp,
+        )
+        # Remove duplicates introduced by CDF. This happens if a row is changed
+        # in multiple read versions. We are only interested in the latest
+        # change.
+        if self.config.deduplication_columns:
+            key_columns = self.config.deduplication_columns
+            key_column_names = [col.name if isinstance(col, Column) else col for col in key_columns]
+            self._console_logger.info(f"Deduplicating with columns: {key_column_names}")
+            window_spec = (
+                Window.partitionBy(*key_column_names)
+                .orderBy(F.desc("_commit_version"))
+                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
+            )
+
+            row_number_col_name = generate_unique_column_name(existing_columns=set(df.columns), prefix="row_num")
+
+            df = (
+                df.withColumn(row_number_col_name, F.row_number().over(window_spec))
+                .filter(F.col(row_number_col_name) == 1)
+                .drop(row_number_col_name)
+            )
+
+        # Strip CDF metadata columns and unpersist the intermediate cache
+        result_df = df.drop("_commit_version", "_commit_timestamp")
+
+        # Unpersist the cached DataFrame to free memory
+        df.unpersist()
+
+        return result_df
+
+    def read_data(
+        self,
+        options: dict[str, str] | None = None,
+    ) -> DataFrame:
+        """Reads data using the CDF strategy.
+
+        Args:
+            options: Additional DataFrameReader options.
+        """
+        self.verify()
+        options = options or {}
+        do_full_load = self.config.enable_full_load and not self._has_valid_metadata()
+
+        if do_full_load:
+            return self._full_load(options)
+
+        return self._delta_load(options)
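
Taken together, the hunk above adds a Pydantic config plus a loader built on Delta Change Data Feed. A minimal usage sketch, not part of the diff: the identifiers are placeholders, the import path mirrors the file layout listed above, and a metadata table plus a CDF-enabled source table are assumed to exist.

    from cloe_nessy.integration.delta_loader.strategies.delta_cdf_loader import (
        DeltaCDFConfig,
        DeltaCDFLoader,
    )

    # enable_full_load falls back to a one-off full read when no usable CDF
    # metadata exists yet (see the DeltaCDFConfig docstring above).
    config = DeltaCDFConfig(deduplication_columns=["id"], enable_full_load=True)
    loader = DeltaCDFLoader(
        config=config,
        table_identifier="catalog.schema.source_table",  # placeholder
        delta_load_identifier="orders_to_silver",  # placeholder
        metadata_table_identifier="catalog.schema.delta_load_metadata",  # placeholder
    )
    # Returns only commits after the last processed version, or a full load on
    # the very first run because enable_full_load is set.
    changes_df = loader.read_data(options={})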
cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py
@@ -0,0 +1,163 @@
+from datetime import UTC, datetime
+from typing import cast
+
+from pydantic import BaseModel, Field, field_validator, model_validator
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as F
+
+from ....integration.writer import CatalogWriter
+from ....models import Column
+from ..delta_loader import DeltaLoader
+
+
+class DeltaTimestampConfig(BaseModel):
+    """This class holds the config for the DeltaTimestampLoader.
+
+    Args:
+        timestamp_filter_cols: A list of columns used for timestamp filtering.
+        from_timestamp: The starting timestamp. If None, it starts from the beginning.
+        to_timestamp: The ending timestamp. If None, it goes up to the latest timestamp.
+        filter_method: The method used for filtering when multiple timestamp
+            columns are used. Allowed values are '||', '&&', 'OR', 'AND'. Defaults
+            to None.
+    """
+
+    timestamp_filter_cols: list[str | Column]
+    from_timestamp: datetime | None = Field(default=None)
+    to_timestamp: datetime | None = Field(default=None)
+    filter_method: str | None = Field(default=None)
+
+    @field_validator("from_timestamp", "to_timestamp", mode="before")
+    @classmethod
+    def parse_datetime(cls, value):
+        """Parses datetime input.
+
+        If a string is parsed, it is expected to be in ISO 8601 format.
+        """
+        if isinstance(value, str):
+            return datetime.fromisoformat(value)
+        return value
+
+    @field_validator("filter_method", mode="before")
+    @classmethod
+    def parse_filter_method(cls, value):
+        """Parses and validates filter_method input."""
+        value = value.upper()
+        match value:
+            case "OR":
+                value = "||"
+            case "AND":
+                value = "&&"
+            case "||" | "&&":
+                # Valid filter methods, do nothing
+                pass
+            case _:
+                raise ValueError("Invalid filter method. Allowed values are '||', '&&', 'OR', 'AND'.")
+        return value
+
+    @model_validator(mode="after")
+    def check_filter_method(self):
+        """Validates that a filter method is set, when more than one timestamp col is used."""
+        if len(self.timestamp_filter_cols) > 1 and self.filter_method is None:
+            raise ValueError("filter_method must be set when more than one timestamp_filter_cols is used.")
+        return self
+
+
+class DeltaTimestampLoader(DeltaLoader):
+    """Implementation of the DeltaLoader interface using timestamp strategy.
+
+    Args:
+        config: Configuration for the DeltaTimestampLoader.
+        table_identifier: Identifier for the table to be loaded.
+        delta_load_identifier: Identifier for the delta load.
+        metadata_table_identifier: Identifier for the metadata table. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        config: DeltaTimestampConfig,
+        table_identifier: str,
+        delta_load_identifier: str,
+        metadata_table_identifier: str | None = None,
+    ):
+        super().__init__(
+            table_identifier,
+            delta_load_identifier,
+            metadata_table_identifier,
+        )
+        self.config = config
+        self.table_reader = self._spark.read
+        self.catalog_writer = CatalogWriter()
+
+    def _get_last_timestamp(self) -> datetime:
+        """Retrieves last read timestamp for delta load."""
+        self._console_logger.info(f"Fetchin last read timestamp for table [ '{self.table_identifier}' ].")
+        df = self.table_reader.table(self.metadata_table_identifier)
+        row = (
+            df.filter(
+                (F.col("source_table_identifier") == self.table_identifier)
+                & (F.col("delta_load_identifier") == self.delta_load_identifier)
+                & F.col("is_processed")
+                & ~F.col("is_stale"),
+            )
+            .agg(F.max("last_read_timestamp"))
+            .first()
+        )
+        last_timestamp = row[0] if row is not None else None
+        if last_timestamp is None:
+            return datetime.fromtimestamp(0)
+        return cast(datetime, last_timestamp)
+
+    def verify(self) -> None:
+        """Verify that the source table has the Change Data Feed enabled."""
+        self._console_logger.info("Verifying that table has all configured timestamp columns.")
+        df = self._spark.read.table(self.table_identifier)
+        missing_columns = [col for col in self.config.timestamp_filter_cols if col not in df.columns]
+        if missing_columns:
+            raise RuntimeError(
+                f"Timestamp filter Columns not found in Table {self.table_identifier} : {', '.join(str(col) for col in missing_columns)}.",
+            )
+
+    def read_data(
+        self,
+        options: dict[str, str] | None = None,
+    ) -> DataFrame:
+        """Reads data using the Timestamp strategy.
+
+        Args:
+            options: Additional DataFrameReader options.
+        """
+        if options is None:
+            options = {}
+
+        last_read_timestamp = self.config.to_timestamp or datetime.now(UTC)
+
+        from_timestamp = self._get_last_timestamp()
+        if self.config.from_timestamp and self.config.from_timestamp > from_timestamp:
+            from_timestamp = self.config.from_timestamp
+        self._invalidate_versions()
+
+        self.table_reader.options(**options)
+        df = self.table_reader.table(self.table_identifier)
+        if from_timestamp != datetime.fromtimestamp(0):
+            df = df.filter(
+                f" {self.config.filter_method} ".join(
+                    [f"{col} >= '{from_timestamp.isoformat()}'" for col in self.config.timestamp_filter_cols],
+                ),
+            )
+        if last_read_timestamp == from_timestamp:
+            # to avoid reading multiple times
+            df = df.limit(0)
+        else:
+            df = df.filter(
+                f" {self.config.filter_method} ".join(
+                    [f"{col} < '{last_read_timestamp.isoformat()}'" for col in self.config.timestamp_filter_cols],
+                ),
+            )
+
+        self._create_metadata_entry(
+            rows=df.count(),
+            last_read_timestamp=last_read_timestamp,
+        )
+
+        return df
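
The timestamp strategy above is the non-CDF counterpart. A minimal sketch under the same caveats (placeholders, import path inferred from the file listing); note that the model validator requires `filter_method` as soon as more than one timestamp column is configured, and 'OR'/'AND' are normalized to '||'/'&&'.

    from cloe_nessy.integration.delta_loader.strategies.delta_timestamp_loader import (
        DeltaTimestampConfig,
        DeltaTimestampLoader,
    )

    config = DeltaTimestampConfig(
        timestamp_filter_cols=["created_at", "updated_at"],  # placeholder column names
        filter_method="OR",  # required because two filter columns are given
    )
    loader = DeltaTimestampLoader(
        config=config,
        table_identifier="catalog.schema.source_table",  # placeholder
        delta_load_identifier="orders_to_silver",  # placeholder
        metadata_table_identifier="catalog.schema.delta_load_metadata",  # placeholder
    )
    df = loader.read_data()  # rows newer than the last recorded read timestamp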
cloe_nessy/integration/reader/catalog_reader.py
@@ -3,6 +3,9 @@ from typing import Any
 from pyspark.sql import DataFrame
 from pyspark.sql.utils import AnalysisException

+from cloe_nessy.integration.delta_loader.delta_load_options import DeltaLoadOptions
+from cloe_nessy.integration.delta_loader.delta_loader_factory import DeltaLoaderFactory
+
 from .exceptions import ReadOperationFailedError
 from .reader import BaseReader

@@ -17,12 +20,21 @@ class CatalogReader(BaseReader):
         """Initializes the CatalogReader object."""
         super().__init__()

-    def read(
+    def read(
+        self,
+        table_identifier: str = "",
+        *,
+        options: dict[str, str] | None = None,
+        delta_load_options: DeltaLoadOptions | None = None,
+        **kwargs: Any,
+    ) -> DataFrame:
         """Reads a table from the Unity Catalog.

         Args:
             table_identifier: The table identifier in the Unity Catalog in the format 'catalog.schema.table'.
-            options: PySpark options for the read table operation
+            options: PySpark options for the read table operation.
+            delta_load_options: Options for delta loading, if applicable. When provided, uses delta loader
+                instead of regular table read to perform incremental loading.
             **kwargs: Additional keyword arguments to maintain compatibility with the base class method.

         Returns:
@@ -30,7 +42,7 @@ class CatalogReader(BaseReader):

         Raises:
             ValueError: If the table_identifier is not provided, is not a string, or is not in the correct format.
-
+            ReadOperationFailedError: For delta load or table read failures.
         """
         if options is None:
             options = {}
@@ -42,11 +54,26 @@ class CatalogReader(BaseReader):
             raise ValueError("table_identifier must be in the format 'catalog.schema.table'")

         try:
+            if delta_load_options:
+                # Use delta loader for incremental loading
+                self._console_logger.info(f"Performing delta load for table: {table_identifier}")
+                delta_loader = DeltaLoaderFactory.create_loader(
+                    table_identifier=table_identifier,
+                    options=delta_load_options,
+                )
+                df = delta_loader.read_data(options=options)
+                self._console_logger.info(f"Delta load completed for table: {table_identifier}")
+                return df
+
+            # Regular table read
             df = self._spark.read.table(table_identifier, **options)
             return df
         except AnalysisException as err:
             raise ValueError(f"Table not found: {table_identifier}") from err
         except Exception as err:
-
-            f"
-
+            if delta_load_options:
+                raise ReadOperationFailedError(f"Delta load failed for table '{table_identifier}': {err}") from err
+            else:
+                raise ReadOperationFailedError(
+                    f"An error occurred while reading the table '{table_identifier}': {err}"
+                ) from err
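
In day-to-day use the loaders are reached through the readers rather than instantiated directly: `read` now accepts `delta_load_options` and hands it to `DeltaLoaderFactory`. A minimal sketch, assuming a `DeltaLoadOptions` instance built elsewhere (its fields live in delta_load_options.py, which is not expanded in this diff) and that `CatalogReader` still takes no constructor arguments:

    from cloe_nessy.integration.delta_loader.delta_load_options import DeltaLoadOptions
    from cloe_nessy.integration.reader.catalog_reader import CatalogReader

    def load_orders(delta_load_options: DeltaLoadOptions | None = None):
        reader = CatalogReader()
        if delta_load_options is None:
            # Regular full read (unchanged behaviour).
            return reader.read("catalog.schema.orders")
        # Incremental read: the reader builds a delta loader via DeltaLoaderFactory
        # and returns only the changes tracked by the loader's metadata.
        return reader.read(
            "catalog.schema.orders",
            delta_load_options=delta_load_options,
        )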
cloe_nessy/integration/reader/file_reader.py
@@ -6,6 +6,8 @@ from pyspark.sql.streaming import DataStreamReader
 from pyspark.sql.types import StructType

 from ...file_utilities import get_file_paths
+from ..delta_loader.delta_load_options import DeltaLoadOptions
+from ..delta_loader.delta_loader_factory import DeltaLoaderFactory
 from .reader import BaseReader


@@ -37,6 +39,7 @@ class FileReader(BaseReader):
         search_subdirs: bool = True,
         options: dict | None = None,
         add_metadata_column: bool = False,
+        delta_load_options: DeltaLoadOptions | None = None,
         **kwargs: Any,
     ) -> DataFrame:
         """Reads files from a specified location and returns a DataFrame.
@@ -49,6 +52,8 @@ class FileReader(BaseReader):
             search_subdirs: Whether to include files in subdirectories.
             options: Spark DataFrame reader options.
             add_metadata_column: Whether to include __metadata column in the DataFrame.
+            delta_load_options: Options for delta loading, if applicable. When provided and spark_format is 'delta',
+                uses delta loader for incremental loading of Delta Lake tables.
             **kwargs: Additional keyword arguments to maintain compatibility with the base class method.

         Raises:
@@ -71,6 +76,23 @@ class FileReader(BaseReader):

         if not spark_format and not extension:
             raise ValueError("Either spark_format or extension must be provided.")
+
+        # Handle delta loading for Delta Lake tables
+        if delta_load_options and (spark_format == "delta" or extension == "delta"):
+            self._console_logger.info(f"Performing delta load for Delta table at: {location}")
+            try:
+                # For Delta tables, use location as table identifier for delta loader
+                delta_loader = DeltaLoaderFactory.create_loader(
+                    table_identifier=location,
+                    options=delta_load_options,
+                )
+                df = delta_loader.read_data(options=options or {})
+                self._console_logger.info(f"Delta load completed for: {location}")
+                return df
+            except Exception as e:
+                self._console_logger.error(f"Delta load failed for '{location}': {e}")
+                raise
+
         self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
         extension_to_datatype_dict = {
             "csv": "csv",
@@ -78,6 +100,7 @@ class FileReader(BaseReader):
             "parquet": "parquet",
             "txt": "text",
             "xml": "xml",
+            "delta": "delta",
         }

         if extension and not spark_format:
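
FileReader gets the same hook for path-based Delta tables: when `delta_load_options` is set and the format resolves to 'delta', the location itself is passed to `DeltaLoaderFactory` as the table identifier. A sketch under the same assumptions, with parameter names taken from the docstring in the hunk above and a placeholder path:

    from cloe_nessy.integration.delta_loader.delta_load_options import DeltaLoadOptions
    from cloe_nessy.integration.reader.file_reader import FileReader

    def load_orders_from_path(delta_load_options: DeltaLoadOptions):
        reader = FileReader()
        return reader.read(
            location="/mnt/lake/tables/orders",  # placeholder path to a Delta table
            spark_format="delta",
            delta_load_options=delta_load_options,
        )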
cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py
@@ -5,7 +5,7 @@ class DeltaTableOperationType(Enum):
     """Mapping between Delta table operation types and their operation metric keys available in the Delta table history.

     Values of metric keys included in this mapping are reported using the
-    logging capabilities of the Delta operations of the
+    logging capabilities of the Delta operations of the DeltaWriter.

     See https://docs.databricks.com/delta/history.html for a complete list and
     description of available metrics for each operation type.