cloe-nessy 0.3.16.6b0__py3-none-any.whl → 0.3.16.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. cloe_nessy/integration/delta_loader/__init__.py +14 -0
  2. cloe_nessy/integration/delta_loader/delta_load_options.py +37 -0
  3. cloe_nessy/integration/delta_loader/delta_loader.py +165 -0
  4. cloe_nessy/integration/delta_loader/delta_loader_factory.py +53 -0
  5. cloe_nessy/integration/delta_loader/delta_loader_metadata_table.py +68 -0
  6. cloe_nessy/integration/delta_loader/strategies/__init__.py +9 -0
  7. cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py +361 -0
  8. cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py +163 -0
  9. cloe_nessy/integration/reader/catalog_reader.py +33 -6
  10. cloe_nessy/integration/reader/file_reader.py +23 -0
  11. cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +1 -1
  12. cloe_nessy/logging/logger_mixin.py +0 -1
  13. cloe_nessy/models/column.py +1 -1
  14. cloe_nessy/models/table.py +4 -3
  15. cloe_nessy/object_manager/table_manager.py +3 -1
  16. cloe_nessy/pipeline/actions/__init__.py +2 -0
  17. cloe_nessy/pipeline/actions/read_catalog_table.py +36 -3
  18. cloe_nessy/pipeline/actions/read_files.py +45 -3
  19. cloe_nessy/pipeline/actions/transform_deduplication.py +7 -12
  20. cloe_nessy/pipeline/actions/transform_hash_columns.py +7 -7
  21. cloe_nessy/pipeline/actions/write_catalog_table.py +5 -0
  22. cloe_nessy/pipeline/actions/write_delta_append.py +15 -0
  23. cloe_nessy/pipeline/actions/write_delta_merge.py +23 -0
  24. cloe_nessy/pipeline/actions/write_file.py +6 -1
  25. cloe_nessy/pipeline/utils/__init__.py +5 -0
  26. cloe_nessy/pipeline/utils/delta_load_utils.py +36 -0
  27. cloe_nessy/utils/column_names.py +9 -0
  28. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/METADATA +1 -1
  29. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/RECORD +31 -20
  30. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/WHEEL +0 -0
  31. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/top_level.txt +0 -0
cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py
@@ -0,0 +1,361 @@
+ from pydantic import BaseModel, Field
+ from pyspark.sql import DataFrame
+ from pyspark.sql import functions as F
+ from pyspark.sql.window import Window
+
+ from ....models import Column
+ from ....utils.column_names import generate_unique_column_name
+ from ..delta_loader import DeltaLoader
+
+
+ class DeltaCDFConfig(BaseModel):
+     """This class holds the config for the DeltaCDFLoader.
+
+     Args:
+         deduplication_columns: A list of columns used for deduplication.
+         from_commit_version: The starting commit version. If None, it starts from the first viable version.
+         to_commit_version: The ending commit version. If None, it goes up to the latest version.
+         enable_full_load: Enables an initial full load of the target table. If
+             no valid delta load history for the table exists, the delta loader
+             will do a full load of the target table and set the metadata to the
+             newest commit version. This might be useful if the change data feed
+             history is incomplete, either because the table was vacuumed or the
+             change data feed was enabled later in the lifecycle of the table.
+             Otherwise the table will initially be loaded from the first valid
+             commit version. When True, `from_commit_version` and
+             `to_commit_version` will be ignored on the initial load. Defaults to
+             False.
+     """
+
+     deduplication_columns: list[str | Column] | None = Field(default=None)
+     from_commit_version: int | None = Field(default=None)
+     to_commit_version: int | None = Field(default=None)
+     enable_full_load: bool = Field(default=False)
+
+
+ class DeltaCDFLoader(DeltaLoader):
+     """Implementation of the DeltaLoader interface using CDF strategy.
+
+     Args:
+         config: Configuration for the DeltaCDFLoader.
+         table_identifier: Identifier for the table to be loaded.
+         delta_load_identifier: Identifier for the delta load.
+         metadata_table_identifier: Identifier for the metadata table. Defaults to None.
+     """
+
+     def __init__(
+         self,
+         config: DeltaCDFConfig,
+         table_identifier: str,
+         delta_load_identifier: str,
+         metadata_table_identifier: str | None = None,
+     ):
+         super().__init__(
+             table_identifier,
+             delta_load_identifier,
+             metadata_table_identifier,
+         )
+         self.config = config
+         self.table_reader = self._spark.read
+
+     def _check_cdf_enabled(self, table_identifier: str) -> bool:
+         """Checks if Change Data Feed is enabled for the table."""
+         try:
+             # Try catalog table approach first (for table names like catalog.schema.table)
+             if table_identifier.count(".") == 2 and not table_identifier.startswith("/"):
+                 table_properties = self._query(f"SHOW TBLPROPERTIES {table_identifier}").collect()
+                 properties_dict = {row["key"]: row["value"] for row in table_properties}
+                 value = properties_dict.get("delta.enableChangeDataFeed", "false")
+                 return str(value).lower() == "true"
+             # For file paths, use Delta Table API directly
+             from delta import DeltaTable  # type: ignore[import-untyped]
+
+             delta_table = DeltaTable.forPath(self._spark, table_identifier)
+             properties = delta_table.detail().select("properties").collect()[0]["properties"]
+             value = properties.get("delta.enableChangeDataFeed", "false") if properties else "false"
+             return str(value).lower() == "true"
+         except Exception:
+             # If we can't determine CDF status, assume it's not enabled
+             return False
+
+     def _has_valid_metadata(self) -> bool:
+         """Checks if valid (i.e. non-stale) metadata exists for the delta load."""
+         try:
+             df = self._spark.sql(f"""
+                 SELECT * FROM {self.metadata_table_identifier}
+                 WHERE source_table_identifier = '{self.table_identifier}'
+                 AND delta_load_identifier = '{self.delta_load_identifier}'
+                 AND is_processed = true
+                 AND is_stale = false
+             """)
+             return not df.isEmpty()
+         except Exception as e:
+             self._console_logger.warning(f"Error accessing metadata table: {e}")
+             return False
+
+     def _get_commit_versions(self) -> tuple[int, int]:
+         """Retrieves the starting and ending commit versions for CDF data."""
+
+         def _get_metadata_df() -> DataFrame:
+             df = self.table_reader.table(self.metadata_table_identifier)
+             return df.filter(
+                 (F.col("source_table_identifier") == self.table_identifier)
+                 & (F.col("delta_load_identifier") == self.delta_load_identifier)
+                 & F.col("is_processed")
+                 & ~F.col("is_stale"),
+             )
+
+         def _get_commit_version(query: DataFrame, version_filter: str | None = None) -> int | None:
+             if version_filter is not None:
+                 query = query.filter(version_filter)
+             row = query.selectExpr("max(version)").first()
+             if row is None or row[0] is None:
+                 return None
+             # Add type validation before casting
+             version_value = row[0]
+             if not isinstance(version_value, (int | float)) or isinstance(version_value, bool):
+                 raise TypeError(f"Expected numeric version, got {type(version_value)}: {version_value}")
+             return int(version_value)
+
+         metadata_df = _get_metadata_df()
+         self._console_logger.info("Querying table history to find minimum version.")
+         min_version_filter = None
+         if self.config.from_commit_version is not None:
+             min_version_filter = f"version >= {self.config.from_commit_version}"
+         # Handle history queries for both catalog tables and file paths
+         if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+             # Catalog table
+             history_query = f"DESCRIBE HISTORY {self.table_identifier}"
+         else:
+             # File path - need to use delta.`path` format
+             history_query = f"DESCRIBE HISTORY delta.`{self.table_identifier}`"
+
+         min_commit_version = _get_commit_version(
+             self._query(history_query).filter(
+                 "operation like 'CREATE%' OR operation = 'TRUNCATE' OR operationParameters.properties like '%delta.enableChangeDataFeed%' "
+             ),
+             min_version_filter,
+         )
+         if min_commit_version is None:
+             min_commit_version = 0
+
+         max_version_filter = None
+         if self.config.to_commit_version is not None:
+             max_version_filter = f"version <= {self.config.to_commit_version}"
+         max_commit_version = _get_commit_version(
+             self._query(history_query),
+             max_version_filter,
+         )
+         if min_commit_version is None or max_commit_version is None:
+             raise RuntimeError(f"No valid versions found for Table [ '{self.table_identifier}' ].")
+
+         # Handle cases based on metadata
+         if metadata_df.isEmpty():
+             # Case 1: No metadata found, read all versions (first delta load)
+             self._console_logger.info("No CDF History for this identifier, reading all versions.")
+             commit_tuple = (min_commit_version, max_commit_version)
+             self._console_logger.info(f"Reading Versions: {commit_tuple}")
+             return commit_tuple
+
+         start_commit_row = metadata_df.agg(F.max("end_commit_version")).first()
+         start_commit_version = start_commit_row[0] if start_commit_row is not None else None
+         if start_commit_version is None:
+             # Case 2: No processed version found in metadata, treat as no metadata
+             self._console_logger.info("No processed version found in metadata, reading all versions.")
+             commit_tuple = (min_commit_version, max_commit_version)
+             self._console_logger.info(f"Reading Versions: {commit_tuple}")
+             return commit_tuple
+
+         if start_commit_version > max_commit_version:
+             # Case 3: Last processed version in metadata is greater than last version in table history
+             # This can happen if the table is recreated after the last processed version
+             raise RuntimeError(
+                 f"Table ['{self.table_identifier}'] history and CDF metadata are incompatible. "
+                 "Either reset the CDF metadata and recreate the target table from scratch,"
+                 "or repair CDF metadata."
+             )
+
+         if min_commit_version > start_commit_version:
+             # Case 4: First version in table history is greater than last processed version in metadata
+             # This can happen if the table is truncated after the last processed version
+             self._console_logger.info("The first version in Table history is greater than the last processed version.")
+             commit_tuple = (min_commit_version, max_commit_version)
+             self._console_logger.info(f"Reading Versions: {commit_tuple}")
+             return commit_tuple
+
+         # Case 5: Normal case, read from last processed version to last available version
+         self._console_logger.info("Reading from the last processed version to the last available version.")
+         commit_tuple = (start_commit_version, max_commit_version)
+         self._console_logger.info(f"Reading Versions: {commit_tuple}")
+         return commit_tuple
+
+     def verify(self) -> None:
+         """Verify that the source table has the Change Data Feed enabled."""
+         self._console_logger.info("Verifying table is enabled for Change Data Feed.")
+         if not self._check_cdf_enabled(self.table_identifier):
+             raise RuntimeError(f"Table {self.table_identifier} is not enabled for Change Data Feed.")
+
+     def _full_load(self, options: dict[str, str]) -> DataFrame:
+         self._console_logger.info(f"Performing full load from source table: {self.table_identifier}")
+
+         # Handle history queries for both catalog tables and file paths
+         if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+             # Catalog table
+             history_query = f"DESCRIBE HISTORY {self.table_identifier}"
+         else:
+             # File path - need to use delta.`path` format
+             history_query = f"DESCRIBE HISTORY delta.`{self.table_identifier}`"
+
+         max_version_query = self._query(history_query).selectExpr("max(version)").first()
+         if not max_version_query or max_version_query[0] is None:
+             raise RuntimeError(f"No valid versions found for Table [ '{self.table_identifier}' ].")
+
+         # Add type validation before casting
+         version_value = max_version_query[0]
+         if not isinstance(version_value, (int | float)) or isinstance(version_value, bool):
+             raise TypeError(f"Expected numeric version, got {type(version_value)}: {version_value}")
+
+         start_version = 0
+         end_version = int(version_value)
+         start_commit_timestamp = None
+         end_commit_timestamp = None
+
+         self.table_reader.options(**options)
+
+         # Handle table reading for both catalog tables and file paths
+         if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+             # Catalog table
+             df = self.table_reader.table(self.table_identifier)
+         else:
+             # File path - use load method
+             df = self.table_reader.load(self.table_identifier)
+
+         # Cache the DataFrame since it will be used for both counting and returning
+         df.cache()
+         row_count = df.count()
+
+         self._create_metadata_entry(
+             rows=row_count,
+             last_read_timestamp=end_commit_timestamp,
+             start_version=start_version,
+             end_version=end_version,
+             start_commit_timestamp=start_commit_timestamp,
+             end_commit_timestamp=end_commit_timestamp,
+         )
+
+         # Note: We keep the DataFrame cached since it's returned to the caller
+         # The caller is responsible for unpersisting when done
+         return df
+
+     def _delta_load(self, options: dict[str, str]) -> DataFrame:
+         self._console_logger.info(f"Performing delta load from source table: {self.table_identifier}")
+         start_version, end_version = self._get_commit_versions()
+
+         self._invalidate_versions()
+
+         if start_version != end_version:
+             # Increment version by one to avoid reading the same version twice
+             read_start_version = str(start_version + 1)
+         else:
+             read_start_version = str(start_version)
+
+         self._console_logger.info(f"Reading commit versions: (from: {read_start_version}, to: {str(end_version)})")
+         # Set CDF-specific options
+         self.table_reader.option("readChangeFeed", "true")
+         self.table_reader.option("startingVersion", read_start_version)
+         self.table_reader.option("endingVersion", str(end_version))
+
+         # Set additional options
+         for key, value in options.items():
+             self.table_reader.option(key, str(value))
+
+         # Handle table reading for both catalog tables and file paths
+         if self.table_identifier.count(".") == 2 and not self.table_identifier.startswith("/"):
+             # Catalog table
+             df = self.table_reader.table(self.table_identifier)
+         else:
+             # File path - use load method
+             df = self.table_reader.load(self.table_identifier)
+
+         df = df.filter("_change_type <> 'update_preimage'")
+
+         # Cache the DataFrame as it will be used multiple times
+         df.cache()
+
+         # Optimize timestamp extraction by combining operations
+         start_commit_timestamp = None
+         end_commit_timestamp = None
+
+         if start_version != end_version:
+             # Combine both timestamp extractions into a single operation
+             timestamp_df = (
+                 df.filter(F.col("_commit_version").isin([start_version, end_version]))
+                 .select("_commit_version", "_commit_timestamp")
+                 .collect()
+             )
+
+             timestamp_map = {row["_commit_version"]: row["_commit_timestamp"] for row in timestamp_df}
+             start_commit_timestamp = timestamp_map.get(start_version)
+             end_commit_timestamp = timestamp_map.get(end_version)
+
+         # Handle case where start_version == end_version
+         if start_version == end_version:
+             df = df.limit(0)
+             row_count = 0
+         else:
+             row_count = df.count()
+
+         self._create_metadata_entry(
+             rows=row_count,
+             last_read_timestamp=end_commit_timestamp,
+             start_version=start_version,
+             end_version=end_version,
+             start_commit_timestamp=start_commit_timestamp,
+             end_commit_timestamp=end_commit_timestamp,
+         )
+         # Remove duplicates introduced by CDF. This happens if a row is changed
+         # in multiple read versions. We are only interested in the latest
+         # change.
+         if self.config.deduplication_columns:
+             key_columns = self.config.deduplication_columns
+             key_column_names = [col.name if isinstance(col, Column) else col for col in key_columns]
+             self._console_logger.info(f"Deduplicating with columns: {key_column_names}")
+             window_spec = (
+                 Window.partitionBy(*key_column_names)
+                 .orderBy(F.desc("_commit_version"))
+                 .rowsBetween(Window.unboundedPreceding, Window.currentRow)
+             )
+
+             row_number_col_name = generate_unique_column_name(existing_columns=set(df.columns), prefix="row_num")
+
+             df = (
+                 df.withColumn(row_number_col_name, F.row_number().over(window_spec))
+                 .filter(F.col(row_number_col_name) == 1)
+                 .drop(row_number_col_name)
+             )
+
+         # Strip CDF metadata columns and unpersist the intermediate cache
+         result_df = df.drop("_commit_version", "_commit_timestamp")
+
+         # Unpersist the cached DataFrame to free memory
+         df.unpersist()
+
+         return result_df
+
+     def read_data(
+         self,
+         options: dict[str, str] | None = None,
+     ) -> DataFrame:
+         """Reads data using the CDF strategy.
+
+         Args:
+             options: Additional DataFrameReader options.
+         """
+         self.verify()
+         options = options or {}
+         do_full_load = self.config.enable_full_load and not self._has_valid_metadata()
+
+         if do_full_load:
+             return self._full_load(options)
+
+         return self._delta_load(options)
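
For orientation, the sketch below shows how the new CDF strategy might be wired up. Only the class names, constructor parameters, and config fields are taken from the file above; the import path and every identifier (source table, delta load id, metadata table, deduplication column) are illustrative assumptions, and a running Spark session is expected to be provided via the DeltaLoader base class.

# Hypothetical usage sketch for the CDF strategy; import path and identifiers are assumptions.
from cloe_nessy.integration.delta_loader.strategies.delta_cdf_loader import (
    DeltaCDFConfig,
    DeltaCDFLoader,
)

config = DeltaCDFConfig(
    deduplication_columns=["order_id"],  # hypothetical business key
    enable_full_load=True,  # fall back to a full read when no valid delta load metadata exists
)

loader = DeltaCDFLoader(
    config=config,
    table_identifier="catalog.schema.orders",  # hypothetical source table
    delta_load_identifier="orders_to_silver",  # hypothetical load identifier
    metadata_table_identifier="catalog.meta.delta_load_metadata",  # hypothetical metadata table
)

# read_data() calls verify() first and raises if CDF is not enabled on the source;
# with enable_full_load=True the first run without valid metadata becomes a full load.
df = loader.read_data(options={})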
cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py
@@ -0,0 +1,163 @@
+ from datetime import UTC, datetime
+ from typing import cast
+
+ from pydantic import BaseModel, Field, field_validator, model_validator
+ from pyspark.sql import DataFrame
+ from pyspark.sql import functions as F
+
+ from ....integration.writer import CatalogWriter
+ from ....models import Column
+ from ..delta_loader import DeltaLoader
+
+
+ class DeltaTimestampConfig(BaseModel):
+     """This class holds the config for the DeltaTimestampLoader.
+
+     Args:
+         timestamp_filter_cols: A list of columns used for timestamp filtering.
+         from_timestamp: The starting timestamp. If None, it starts from the beginning.
+         to_timestamp: The ending timestamp. If None, it goes up to the latest timestamp.
+         filter_method: The method used for filtering when multiple timestamp
+             columns are used. Allowed values are '||', '&&', 'OR', 'AND'. Defaults
+             to None.
+     """
+
+     timestamp_filter_cols: list[str | Column]
+     from_timestamp: datetime | None = Field(default=None)
+     to_timestamp: datetime | None = Field(default=None)
+     filter_method: str | None = Field(default=None)
+
+     @field_validator("from_timestamp", "to_timestamp", mode="before")
+     @classmethod
+     def parse_datetime(cls, value):
+         """Parses datetime input.
+
+         If a string is parsed, it is expected to be in ISO 8601 format.
+         """
+         if isinstance(value, str):
+             return datetime.fromisoformat(value)
+         return value
+
+     @field_validator("filter_method", mode="before")
+     @classmethod
+     def parse_filter_method(cls, value):
+         """Parses and validates filter_method input."""
+         value = value.upper()
+         match value:
+             case "OR":
+                 value = "||"
+             case "AND":
+                 value = "&&"
+             case "||" | "&&":
+                 # Valid filter methods, do nothing
+                 pass
+             case _:
+                 raise ValueError("Invalid filter method. Allowed values are '||', '&&', 'OR', 'AND'.")
+         return value
+
+     @model_validator(mode="after")
+     def check_filter_method(self):
+         """Validates that a filter method is set, when more than one timestamp col is used."""
+         if len(self.timestamp_filter_cols) > 1 and self.filter_method is None:
+             raise ValueError("filter_method must be set when more than one timestamp_filter_cols is used.")
+         return self
+
+
+ class DeltaTimestampLoader(DeltaLoader):
+     """Implementation of the DeltaLoader interface using timestamp strategy.
+
+     Args:
+         config: Configuration for the DeltaTimestampLoader.
+         table_identifier: Identifier for the table to be loaded.
+         delta_load_identifier: Identifier for the delta load.
+         metadata_table_identifier: Identifier for the metadata table. Defaults to None.
+     """
+
+     def __init__(
+         self,
+         config: DeltaTimestampConfig,
+         table_identifier: str,
+         delta_load_identifier: str,
+         metadata_table_identifier: str | None = None,
+     ):
+         super().__init__(
+             table_identifier,
+             delta_load_identifier,
+             metadata_table_identifier,
+         )
+         self.config = config
+         self.table_reader = self._spark.read
+         self.catalog_writer = CatalogWriter()
+
+     def _get_last_timestamp(self) -> datetime:
+         """Retrieves last read timestamp for delta load."""
+         self._console_logger.info(f"Fetchin last read timestamp for table [ '{self.table_identifier}' ].")
+         df = self.table_reader.table(self.metadata_table_identifier)
+         row = (
+             df.filter(
+                 (F.col("source_table_identifier") == self.table_identifier)
+                 & (F.col("delta_load_identifier") == self.delta_load_identifier)
+                 & F.col("is_processed")
+                 & ~F.col("is_stale"),
+             )
+             .agg(F.max("last_read_timestamp"))
+             .first()
+         )
+         last_timestamp = row[0] if row is not None else None
+         if last_timestamp is None:
+             return datetime.fromtimestamp(0)
+         return cast(datetime, last_timestamp)
+
+     def verify(self) -> None:
+         """Verify that the source table has the Change Data Feed enabled."""
+         self._console_logger.info("Verifying that table has all configured timestamp columns.")
+         df = self._spark.read.table(self.table_identifier)
+         missing_columns = [col for col in self.config.timestamp_filter_cols if col not in df.columns]
+         if missing_columns:
+             raise RuntimeError(
+                 f"Timestamp filter Columns not found in Table {self.table_identifier} : {', '.join(str(col) for col in missing_columns)}.",
+             )
+
+     def read_data(
+         self,
+         options: dict[str, str] | None = None,
+     ) -> DataFrame:
+         """Reads data using the Timestamp strategy.
+
+         Args:
+             options: Additional DataFrameReader options.
+         """
+         if options is None:
+             options = {}
+
+         last_read_timestamp = self.config.to_timestamp or datetime.now(UTC)
+
+         from_timestamp = self._get_last_timestamp()
+         if self.config.from_timestamp and self.config.from_timestamp > from_timestamp:
+             from_timestamp = self.config.from_timestamp
+             self._invalidate_versions()
+
+         self.table_reader.options(**options)
+         df = self.table_reader.table(self.table_identifier)
+         if from_timestamp != datetime.fromtimestamp(0):
+             df = df.filter(
+                 f" {self.config.filter_method} ".join(
+                     [f"{col} >= '{from_timestamp.isoformat()}'" for col in self.config.timestamp_filter_cols],
+                 ),
+             )
+         if last_read_timestamp == from_timestamp:
+             # to avoid reading multiple times
+             df = df.limit(0)
+         else:
+             df = df.filter(
+                 f" {self.config.filter_method} ".join(
+                     [f"{col} < '{last_read_timestamp.isoformat()}'" for col in self.config.timestamp_filter_cols],
+                 ),
+             )
+
+         self._create_metadata_entry(
+             rows=df.count(),
+             last_read_timestamp=last_read_timestamp,
+         )
+
+         return df
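
A corresponding sketch for the timestamp strategy. As above, the import path and all identifiers are assumptions; the config values only exercise behavior visible in the file itself (string timestamps are parsed as ISO 8601, 'OR'/'AND' are normalized to '||'/'&&', and a filter_method is required once more than one timestamp column is configured).

# Hypothetical usage sketch for the timestamp strategy; import path and identifiers are assumptions.
from cloe_nessy.integration.delta_loader.strategies.delta_timestamp_loader import (
    DeltaTimestampConfig,
    DeltaTimestampLoader,
)

config = DeltaTimestampConfig(
    timestamp_filter_cols=["updated_at", "created_at"],  # hypothetical timestamp columns
    filter_method="OR",  # normalized to '||' by the validator; required for more than one column
    from_timestamp="2024-01-01T00:00:00",  # ISO 8601 string, parsed by the validator
)

loader = DeltaTimestampLoader(
    config=config,
    table_identifier="catalog.schema.orders",  # hypothetical source table
    delta_load_identifier="orders_to_silver",  # hypothetical load identifier
    metadata_table_identifier="catalog.meta.delta_load_metadata",  # hypothetical metadata table
)

# Rows are filtered between the last recorded watermark and to_timestamp (or now),
# and a new metadata entry records the upper bound for the next run.
df = loader.read_data()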
cloe_nessy/integration/reader/catalog_reader.py
@@ -3,6 +3,9 @@ from typing import Any
  from pyspark.sql import DataFrame
  from pyspark.sql.utils import AnalysisException

+ from cloe_nessy.integration.delta_loader.delta_load_options import DeltaLoadOptions
+ from cloe_nessy.integration.delta_loader.delta_loader_factory import DeltaLoaderFactory
+
  from .exceptions import ReadOperationFailedError
  from .reader import BaseReader

@@ -17,12 +20,21 @@ class CatalogReader(BaseReader):
          """Initializes the CatalogReader object."""
          super().__init__()

-     def read(self, table_identifier: str = "", *, options: dict[str, str] | None = None, **kwargs: Any) -> DataFrame:
+     def read(
+         self,
+         table_identifier: str = "",
+         *,
+         options: dict[str, str] | None = None,
+         delta_load_options: DeltaLoadOptions | None = None,
+         **kwargs: Any,
+     ) -> DataFrame:
          """Reads a table from the Unity Catalog.

          Args:
              table_identifier: The table identifier in the Unity Catalog in the format 'catalog.schema.table'.
-             options: PySpark options for the read table operation (not used in the current implementation).
+             options: PySpark options for the read table operation.
+             delta_load_options: Options for delta loading, if applicable. When provided, uses delta loader
+                 instead of regular table read to perform incremental loading.
              **kwargs: Additional keyword arguments to maintain compatibility with the base class method.

          Returns:
@@ -30,7 +42,7 @@ class CatalogReader(BaseReader):

          Raises:
              ValueError: If the table_identifier is not provided, is not a string, or is not in the correct format.
-             Exception: For any other unexpected errors.
+             ReadOperationFailedError: For delta load or table read failures.
          """
          if options is None:
              options = {}
@@ -42,11 +54,26 @@
              raise ValueError("table_identifier must be in the format 'catalog.schema.table'")

          try:
+             if delta_load_options:
+                 # Use delta loader for incremental loading
+                 self._console_logger.info(f"Performing delta load for table: {table_identifier}")
+                 delta_loader = DeltaLoaderFactory.create_loader(
+                     table_identifier=table_identifier,
+                     options=delta_load_options,
+                 )
+                 df = delta_loader.read_data(options=options)
+                 self._console_logger.info(f"Delta load completed for table: {table_identifier}")
+                 return df
+
+             # Regular table read
              df = self._spark.read.table(table_identifier, **options)
              return df
          except AnalysisException as err:
              raise ValueError(f"Table not found: {table_identifier}") from err
          except Exception as err:
-             raise ReadOperationFailedError(
-                 f"An error occurred while reading the table '{table_identifier}': {err}"
-             ) from err
+             if delta_load_options:
+                 raise ReadOperationFailedError(f"Delta load failed for table '{table_identifier}': {err}") from err
+             else:
+                 raise ReadOperationFailedError(
+                     f"An error occurred while reading the table '{table_identifier}': {err}"
+                 ) from err
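
The extended CatalogReader.read() can then be exercised roughly as follows. The import paths are assumptions, and build_delta_load_options() is a placeholder, because the fields of DeltaLoadOptions (defined in delta_load_options.py) are not shown in this section.

# Hypothetical sketch of the new delta_load_options parameter on CatalogReader.read().
from cloe_nessy.integration.delta_loader.delta_load_options import DeltaLoadOptions
from cloe_nessy.integration.reader.catalog_reader import CatalogReader  # import path assumed


def build_delta_load_options() -> DeltaLoadOptions:
    # Placeholder: construct DeltaLoadOptions according to the fields defined in
    # delta_load_options.py, which are not part of this section.
    raise NotImplementedError


reader = CatalogReader()

# Regular full read, unchanged behavior.
full_df = reader.read("catalog.schema.orders", options={})  # hypothetical table

# Incremental read: with delta_load_options set, read() delegates to
# DeltaLoaderFactory.create_loader(...) and returns only the loaded increment.
incremental_df = reader.read(
    "catalog.schema.orders",
    options={},
    delta_load_options=build_delta_load_options(),
)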
cloe_nessy/integration/reader/file_reader.py
@@ -6,6 +6,8 @@ from pyspark.sql.streaming import DataStreamReader
  from pyspark.sql.types import StructType

  from ...file_utilities import get_file_paths
+ from ..delta_loader.delta_load_options import DeltaLoadOptions
+ from ..delta_loader.delta_loader_factory import DeltaLoaderFactory
  from .reader import BaseReader


@@ -37,6 +39,7 @@ class FileReader(BaseReader):
          search_subdirs: bool = True,
          options: dict | None = None,
          add_metadata_column: bool = False,
+         delta_load_options: DeltaLoadOptions | None = None,
          **kwargs: Any,
      ) -> DataFrame:
          """Reads files from a specified location and returns a DataFrame.
@@ -49,6 +52,8 @@
              search_subdirs: Whether to include files in subdirectories.
              options: Spark DataFrame reader options.
              add_metadata_column: Whether to include __metadata column in the DataFrame.
+             delta_load_options: Options for delta loading, if applicable. When provided and spark_format is 'delta',
+                 uses delta loader for incremental loading of Delta Lake tables.
              **kwargs: Additional keyword arguments to maintain compatibility with the base class method.

          Raises:
@@ -71,6 +76,23 @@

          if not spark_format and not extension:
              raise ValueError("Either spark_format or extension must be provided.")
+
+         # Handle delta loading for Delta Lake tables
+         if delta_load_options and (spark_format == "delta" or extension == "delta"):
+             self._console_logger.info(f"Performing delta load for Delta table at: {location}")
+             try:
+                 # For Delta tables, use location as table identifier for delta loader
+                 delta_loader = DeltaLoaderFactory.create_loader(
+                     table_identifier=location,
+                     options=delta_load_options,
+                 )
+                 df = delta_loader.read_data(options=options or {})
+                 self._console_logger.info(f"Delta load completed for: {location}")
+                 return df
+             except Exception as e:
+                 self._console_logger.error(f"Delta load failed for '{location}': {e}")
+                 raise
+
          self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
          extension_to_datatype_dict = {
              "csv": "csv",
@@ -78,6 +100,7 @@
              "parquet": "parquet",
              "txt": "text",
              "xml": "xml",
+             "delta": "delta",
          }

          if extension and not spark_format:
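
A similar, equally hypothetical sketch for the FileReader path. Apart from delta_load_options, the shown parameter names (location, spark_format) are inferred from the method body, the no-argument constructor and the import path are assumptions, and build_delta_load_options() is the same kind of placeholder as above.

# Hypothetical sketch: incremental read of a Delta table by path via FileReader.read().
from cloe_nessy.integration.delta_loader.delta_load_options import DeltaLoadOptions
from cloe_nessy.integration.reader.file_reader import FileReader  # import path assumed


def build_delta_load_options() -> DeltaLoadOptions:
    # Placeholder: the concrete fields of DeltaLoadOptions are not shown in this section.
    raise NotImplementedError


reader = FileReader()  # no-argument construction assumed

df = reader.read(
    location="abfss://bronze@storage.dfs.core.windows.net/orders",  # hypothetical Delta table path
    spark_format="delta",
    delta_load_options=build_delta_load_options(),  # routes the read through DeltaLoaderFactory
)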
cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py
@@ -5,7 +5,7 @@ class DeltaTableOperationType(Enum):
      """Mapping between Delta table operation types and their operation metric keys available in the Delta table history.

      Values of metric keys included in this mapping are reported using the
-     logging capabilities of the Delta operations of the DeltaManager.
+     logging capabilities of the Delta operations of the DeltaWriter.

      See https://docs.databricks.com/delta/history.html for a complete list and
      description of available metrics for each operation type.
cloe_nessy/logging/logger_mixin.py
@@ -1,5 +1,4 @@
  import logging
- import logging.handlers
  from typing import cast

  from cloe_logging import LoggerFactory
cloe_nessy/models/column.py
@@ -28,7 +28,7 @@ class Column(BaseModel):

      name: str
      data_type: str
-     nullable: bool
+     nullable: bool = True
      default_value: Any = None
      generated: str | None = None
      properties: dict[str, Any] = Field(default_factory=dict)