cloe-nessy 0.3.16.6b0__py3-none-any.whl → 0.3.16.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. cloe_nessy/integration/delta_loader/__init__.py +14 -0
  2. cloe_nessy/integration/delta_loader/delta_load_options.py +37 -0
  3. cloe_nessy/integration/delta_loader/delta_loader.py +165 -0
  4. cloe_nessy/integration/delta_loader/delta_loader_factory.py +53 -0
  5. cloe_nessy/integration/delta_loader/delta_loader_metadata_table.py +68 -0
  6. cloe_nessy/integration/delta_loader/strategies/__init__.py +9 -0
  7. cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py +361 -0
  8. cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py +163 -0
  9. cloe_nessy/integration/reader/catalog_reader.py +33 -6
  10. cloe_nessy/integration/reader/file_reader.py +23 -0
  11. cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +1 -1
  12. cloe_nessy/logging/logger_mixin.py +0 -1
  13. cloe_nessy/models/column.py +1 -1
  14. cloe_nessy/models/table.py +4 -3
  15. cloe_nessy/object_manager/table_manager.py +3 -1
  16. cloe_nessy/pipeline/actions/__init__.py +2 -0
  17. cloe_nessy/pipeline/actions/read_catalog_table.py +36 -3
  18. cloe_nessy/pipeline/actions/read_files.py +45 -3
  19. cloe_nessy/pipeline/actions/transform_deduplication.py +7 -12
  20. cloe_nessy/pipeline/actions/transform_hash_columns.py +7 -7
  21. cloe_nessy/pipeline/actions/write_catalog_table.py +5 -0
  22. cloe_nessy/pipeline/actions/write_delta_append.py +15 -0
  23. cloe_nessy/pipeline/actions/write_delta_merge.py +23 -0
  24. cloe_nessy/pipeline/actions/write_file.py +6 -1
  25. cloe_nessy/pipeline/utils/__init__.py +5 -0
  26. cloe_nessy/pipeline/utils/delta_load_utils.py +36 -0
  27. cloe_nessy/utils/column_names.py +9 -0
  28. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/METADATA +1 -1
  29. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/RECORD +31 -20
  30. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/WHEEL +0 -0
  31. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.16.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,14 @@
1
+ from .delta_loader import DeltaLoader
2
+ from .delta_loader_factory import DeltaLoaderFactory, DeltaLoadOptions, consume_delta_load
3
+ from .strategies import DeltaCDFConfig, DeltaCDFLoader, DeltaTimestampConfig, DeltaTimestampLoader
4
+
5
+ __all__ = [
6
+ "consume_delta_load",
7
+ "DeltaCDFConfig",
8
+ "DeltaCDFLoader",
9
+ "DeltaLoader",
10
+ "DeltaLoaderFactory",
11
+ "DeltaLoadOptions",
12
+ "DeltaTimestampConfig",
13
+ "DeltaTimestampLoader",
14
+ ]
from pathlib import Path
from typing import Self

import yaml
from pydantic import BaseModel


class DeltaLoadOptions(BaseModel):
    """Options to configure the DeltaLoader.

    Args:
        strategy: Delta load strategy to use.
        delta_load_identifier: Unique delta load identifier used to track the delta load metadata.
        strategy_options: Options used to configure the chosen delta load strategy.
            See the config class of the particular strategy for more info.
        metadata_table_identifier: Identifier of the metadata table used to keep
            track of the delta load metadata. The table will be created if it does
            not exist. If none, it will default to `<source_catalog>.<source_schema>.metadata_delta_load`.
    """

    strategy: str
    delta_load_identifier: str
    strategy_options: dict
    metadata_table_identifier: str | None = None

    @classmethod
    def from_yaml_str(cls, yaml_str: str) -> Self:
        """Creates an instance of DeltaLoadOptions from a YAML string.

        Raises:
            ValueError: If the YAML document is not a mapping — splatting a
                scalar or list into the model would otherwise fail with a
                cryptic TypeError.
        """
        options = yaml.safe_load(yaml_str)
        if not isinstance(options, dict):
            raise ValueError(f"Expected a YAML mapping for DeltaLoadOptions, got {type(options).__name__}.")
        return cls(**options)

    @classmethod
    def from_file(cls, path: str | Path) -> Self:
        """Creates an instance of DeltaLoadOptions from a YAML file."""
        # Read with an explicit encoding; the platform default may not be UTF-8.
        yaml_str = Path(path).read_text(encoding="utf-8")
        return cls.from_yaml_str(yaml_str)
from abc import ABC, abstractmethod
from collections.abc import Callable
from datetime import datetime
from typing import Any

from delta import DeltaTable  # type: ignore[import-untyped]
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

from ...integration.writer import CatalogWriter
from ...logging import LoggerMixin
from ...object_manager import TableManager
from ...session import SessionManager
from .delta_loader_metadata_table import DeltaLoaderMetadataTable


class DeltaLoader(ABC, LoggerMixin):
    """Base class for delta load operations.

    Concrete strategies implement ``read_data`` and ``verify``; this base
    class owns the bookkeeping in the metadata table (creating, invalidating
    and consuming entries).

    Args:
        table_identifier: Identifier for the table to be loaded.
        delta_load_identifier: Identifier for the delta load.
        metadata_table_identifier: Identifier for the metadata table. If None,
            the metadata_table_identifier will be derived from the table identifier:
            `<table_catalog>.<table_schema>.metadata_delta_load`.
    """

    def __init__(
        self,
        table_identifier: str,
        delta_load_identifier: str,
        metadata_table_identifier: str | None = None,
    ):
        self._spark = SessionManager.get_spark_session()
        self._console_logger = self.get_console_logger()
        self.table_identifier = table_identifier
        self.delta_load_identifier = delta_load_identifier
        if metadata_table_identifier:
            self.metadata_table_identifier = metadata_table_identifier
        else:
            # Derive `<catalog>.<schema>.metadata_delta_load` from the source
            # table. Fail with a clear error instead of a bare IndexError when
            # the identifier does not have at least catalog and schema parts.
            parts = self.table_identifier.split(".")
            if len(parts) < 2:
                raise ValueError(
                    f"Cannot derive a metadata table identifier from '{self.table_identifier}'; "
                    "expected '<catalog>.<schema>.<table>' or an explicit metadata_table_identifier.",
                )
            self.metadata_table_identifier = f"{parts[0]}.{parts[1]}.metadata_delta_load"
        # Ensure the metadata table exists before any read/consume operation.
        table_manager = TableManager()
        table_manager.create_table(table=DeltaLoaderMetadataTable(identifier=self.metadata_table_identifier))

    @abstractmethod
    def read_data(
        self,
        options: dict[str, str] | None = None,
    ) -> DataFrame:
        """Reads data incrementally using a strategy.

        Args:
            options: Additional DataFrameReader options.
        """

    @abstractmethod
    def verify(self) -> None:
        """Verify that the source table qualifies for the delta load strategy."""

    def _query(self, query: str) -> DataFrame:
        """Runs a SQL query on the Spark session and returns the result."""
        df = self._spark.sql(query)
        return df

    def _create_metadata_entry(
        self,
        *,
        rows: int,
        last_read_timestamp: datetime | None = None,
        start_version: int | None = None,
        end_version: int | None = None,
        start_commit_timestamp: datetime | None = None,
        end_commit_timestamp: datetime | None = None,
    ) -> None:
        """Creates an entry in the metadata table for the delta load.

        Args:
            rows: Number of rows read in this delta load.
            last_read_timestamp: High-water-mark timestamp (timestamp strategy).
            start_version: First Delta commit version read (CDF strategy).
            end_version: Last Delta commit version read (CDF strategy).
            start_commit_timestamp: Timestamp of the first commit read.
            end_commit_timestamp: Timestamp of the last commit read.
        """
        self._console_logger.info(
            f"Creating metadata entry for table: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ]",
        )
        metadata_df = self._spark.range(1)
        metadata_df = metadata_df.select(
            # Cast BEFORE aliasing: `.alias("rows").cast("bigint")` would name
            # the column `CAST(rows AS BIGINT)`, losing the alias and breaking
            # the metadata table schema.
            F.lit(rows).cast("bigint").alias("rows"),
            F.lit(False).alias("is_processed"),
            F.lit(False).alias("is_stale"),
            F.lit(self.table_identifier).alias("source_table_identifier"),
            F.lit(self.delta_load_identifier).alias("delta_load_identifier"),
            F.lit(start_version).alias("start_commit_version"),
            F.lit(end_version).alias("end_commit_version"),
            F.lit(start_commit_timestamp).alias("start_commit_timestamp_utc"),
            F.lit(end_commit_timestamp).alias("end_commit_timestamp_utc"),
            F.lit(last_read_timestamp).alias("last_read_timestamp"),
            F.current_timestamp().alias("__DCR"),
            F.current_timestamp().alias("__DMR"),
        ).withColumn(
            # Surrogate key over source table, delta load id and creation time.
            "BK",
            F.md5(
                F.concat_ws(
                    "-",
                    F.col("source_table_identifier"),
                    F.col("delta_load_identifier"),
                    F.current_timestamp(),
                ),
            ),
        )
        catalog_writer = CatalogWriter()
        catalog_writer.write_table(
            df=metadata_df,
            table_identifier=self.metadata_table_identifier,
            mode="append",
        )

    def _invalidate_versions(self) -> None:
        """Invalidate any pending changes in the metadata for the delta load."""
        self._console_logger.info(
            f"Invalidating unprocessed delta load metadata for table: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ]",
        )
        delta_table = DeltaTable.forName(self._spark, self.metadata_table_identifier)
        # Only unprocessed, non-stale entries are flipped to stale.
        delta_table.update(
            condition=(F.col("source_table_identifier") == self.table_identifier)
            & (F.col("delta_load_identifier") == self.delta_load_identifier)
            & ~F.col("is_processed")
            & ~F.col("is_stale"),
            set={"is_stale": F.lit(True), "__DMR": F.current_timestamp()},
        )

    def reset_cdf(self) -> None:
        """Invalidates all changes in the metadata for the delta load.

        Unlike ``_invalidate_versions`` this also marks already-processed
        entries as stale, so the next run starts from scratch.
        """
        delta_table = DeltaTable.forName(self._spark, self.metadata_table_identifier)
        self._console_logger.info(
            f"Resetting delta load metadata for table: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ]",
        )
        delta_table.update(
            condition=(F.col("source_table_identifier") == self.table_identifier)
            & (F.col("delta_load_identifier") == self.delta_load_identifier)
            & ~F.col("is_stale"),
            set={"is_stale": F.lit(True), "__DMR": F.current_timestamp()},
        )

    def consume_data(self) -> None:
        """Marks data as consumed in the metadata for the delta load."""
        df = self._spark.table(self.metadata_table_identifier)
        df = df.filter(
            (F.col("source_table_identifier") == self.table_identifier)
            & (F.col("delta_load_identifier") == self.delta_load_identifier)
            & ~F.col("is_processed")
            & ~F.col("is_stale"),
        )
        # NOTE(review): limit(1) without an ordering picks an arbitrary pending
        # entry when several exist — confirm that at most one unprocessed entry
        # per (table, delta_load_identifier) is expected here.
        df = df.groupBy("BK", "delta_load_identifier").agg(F.max("__DCR")).limit(1)
        self._console_logger.info(
            f"Mark metadata for table as processed: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ].",
        )
        delta_table = DeltaTable.forName(self._spark, self.metadata_table_identifier)
        delta_table.alias("target").merge(df.alias("source"), "target.BK = source.BK").whenMatchedUpdate(
            set={
                "is_processed": F.lit(True),
                "__DMR": F.current_timestamp(),
            },
        ).execute()

    def write_data(self, write_callable: Callable[[], Any]):
        """Wrapper to write and consume a delta load.

        Args:
            write_callable: Zero-argument callable (e.g. a ``functools.partial``)
                performing the actual write. Annotation generalized from
                ``partial`` to any nullary callable — backward compatible.

        Raises:
            RuntimeError: If the write fails; the load is then NOT consumed.
        """
        try:
            write_callable()
        except Exception as e:
            raise RuntimeError("Error while writing...") from e
        # Consume only after a successful write, so a failed write can be retried.
        self.consume_data()
from typing import Any

from .delta_load_options import DeltaLoadOptions
from .delta_loader import DeltaLoader
from .strategies import DeltaCDFConfig, DeltaCDFLoader, DeltaTimestampConfig, DeltaTimestampLoader


def consume_delta_load(
    runtime_info: dict[str, Any],
    delta_load_identifier: str | None = None,
) -> None:
    """Consumes a delta load by updating the metadata table.

    Args:
        runtime_info: Runtime information.
        delta_load_identifier: If set, the ConsumeDeltaLoadAction action
            will only consume DeltaLoader transaction for the given
            delta_load_identifier.
    """
    for table_name, value in runtime_info["delta_load_options"].items():
        # Skip entries that do not match the requested identifier (when given).
        if delta_load_identifier is not None and delta_load_identifier != value.get("delta_load_identifier"):
            continue
        options = DeltaLoadOptions(**value)
        delta_loader: DeltaLoader = DeltaLoaderFactory.create_loader(
            table_identifier=table_name,
            options=options,
        )
        delta_loader.consume_data()
class DeltaLoaderFactory:
    """Factory to create a DeltaLoader instance based on the DeltaLoadOptions."""

    @staticmethod
    def create_loader(table_identifier: str, options: DeltaLoadOptions) -> DeltaLoader:
        """Creates an instance of DeltaLoader, choosing the desired strategy."""
        # Dispatch table: strategy name -> (config class, loader class).
        registry = {
            "CDF": (DeltaCDFConfig, DeltaCDFLoader),
            "TIMESTAMP": (DeltaTimestampConfig, DeltaTimestampLoader),
        }
        strategy_key = options.strategy.upper()
        if strategy_key not in registry:
            raise ValueError(f"Unknown strategy: {options.strategy}")
        config_cls, loader_cls = registry[strategy_key]
        return loader_cls(
            table_identifier=table_identifier,
            delta_load_identifier=options.delta_load_identifier,
            config=config_cls(**options.strategy_options),
            metadata_table_identifier=options.metadata_table_identifier,
        )
from ...models import Column, Table


class DeltaLoaderMetadataTable(Table):
    """A Table Model for the Delta CDF Reader Metadata Table.

    One row is appended per delta read; rows are later flipped to
    ``is_processed`` on consumption or ``is_stale`` on invalidation.
    """

    data_source_format: str = "DELTA"
    # NOTE(review): partition_by and liquid_clustering are both set — on
    # Databricks these are mutually exclusive; confirm the table manager
    # resolves this (e.g. ignores partition_by when clustering is enabled).
    partition_by: list[str] = ["source_table_identifier"]
    liquid_clustering: bool = True
    columns: list[Column] = [
        # Surrogate key: md5 over source table, delta load id and creation time.
        Column(
            name="BK",
            data_type="STRING",
        ),
        # Logical identity of an entry.
        Column(
            name="delta_load_identifier",
            data_type="STRING",
        ),
        Column(
            name="source_table_identifier",
            data_type="STRING",
        ),
        # Lifecycle flags.
        Column(
            name="is_processed",
            data_type="BOOLEAN",
        ),
        Column(
            name="is_stale",
            data_type="BOOLEAN",
        ),
        # High-water mark for the timestamp strategy.
        Column(
            name="last_read_timestamp",
            data_type="TIMESTAMP",
            nullable=True,
        ),
        Column(
            name="rows",
            data_type="BIGINT",
        ),
        # Commit range read by the CDF strategy.
        Column(
            name="start_commit_version",
            data_type="INT",
            nullable=True,
        ),
        Column(
            name="end_commit_version",
            data_type="INT",
            nullable=True,
        ),
        Column(
            name="start_commit_timestamp_utc",
            data_type="TIMESTAMP",
            nullable=True,
        ),
        Column(
            name="end_commit_timestamp_utc",
            data_type="TIMESTAMP",
            nullable=True,
        ),
        # Technical audit columns: created / last-modified timestamps.
        Column(
            name="__DCR",
            data_type="TIMESTAMP",
        ),
        Column(
            name="__DMR",
            data_type="TIMESTAMP",
        ),
    ]
"""Delta load strategy implementations and their configuration models."""

from .delta_cdf_loader import DeltaCDFConfig, DeltaCDFLoader
from .delta_timestamp_loader import DeltaTimestampConfig, DeltaTimestampLoader

__all__ = [
    "DeltaCDFConfig",
    "DeltaCDFLoader",
    "DeltaTimestampConfig",
    "DeltaTimestampLoader",
]