cloe-nessy 0.3.16.6b0__py3-none-any.whl → 0.3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/delta_loader/__init__.py +14 -0
- cloe_nessy/integration/delta_loader/delta_load_options.py +37 -0
- cloe_nessy/integration/delta_loader/delta_loader.py +165 -0
- cloe_nessy/integration/delta_loader/delta_loader_factory.py +53 -0
- cloe_nessy/integration/delta_loader/delta_loader_metadata_table.py +68 -0
- cloe_nessy/integration/delta_loader/strategies/__init__.py +9 -0
- cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py +361 -0
- cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py +163 -0
- cloe_nessy/integration/reader/catalog_reader.py +33 -6
- cloe_nessy/integration/reader/file_reader.py +23 -0
- cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +1 -1
- cloe_nessy/logging/logger_mixin.py +0 -1
- cloe_nessy/models/column.py +1 -1
- cloe_nessy/models/table.py +4 -3
- cloe_nessy/object_manager/table_manager.py +3 -1
- cloe_nessy/pipeline/actions/__init__.py +4 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +36 -3
- cloe_nessy/pipeline/actions/read_files.py +45 -3
- cloe_nessy/pipeline/actions/transform_convert_timestamp.py +97 -0
- cloe_nessy/pipeline/actions/transform_deduplication.py +7 -12
- cloe_nessy/pipeline/actions/transform_hash_columns.py +7 -7
- cloe_nessy/pipeline/actions/write_catalog_table.py +5 -0
- cloe_nessy/pipeline/actions/write_delta_append.py +15 -0
- cloe_nessy/pipeline/actions/write_delta_merge.py +23 -0
- cloe_nessy/pipeline/actions/write_file.py +6 -1
- cloe_nessy/pipeline/utils/__init__.py +5 -0
- cloe_nessy/pipeline/utils/delta_load_utils.py +36 -0
- cloe_nessy/utils/column_names.py +9 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/METADATA +3 -3
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/RECORD +32 -20
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/WHEEL +0 -0
- {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Public API of the delta_loader integration package.

Re-exports the loader base class, the factory (plus its options model and the
``consume_delta_load`` helper), and the concrete strategy implementations with
their configuration models, so callers can import everything from one place.
"""

from .delta_loader import DeltaLoader
from .delta_loader_factory import DeltaLoaderFactory, DeltaLoadOptions, consume_delta_load
from .strategies import DeltaCDFConfig, DeltaCDFLoader, DeltaTimestampConfig, DeltaTimestampLoader

__all__ = [
    "consume_delta_load",
    "DeltaCDFConfig",
    "DeltaCDFLoader",
    "DeltaLoader",
    "DeltaLoaderFactory",
    "DeltaLoadOptions",
    "DeltaTimestampConfig",
    "DeltaTimestampLoader",
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Self
|
|
3
|
+
|
|
4
|
+
import yaml
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DeltaLoadOptions(BaseModel):
    """Options to configure the DeltaLoader.

    Args:
        strategy: Delta load strategy to use.
        delta_load_identifier: Unique delta load identifier used to track the delta load metadata.
        strategy_options: Options used to configure the chosen delta load strategy.
            See the config class of the particular strategy for more info.
        metadata_table_identifier: Identifier of the metadata table used to keep
            track of the delta load metadata. The table will be created if it does
            not exist. If none, it will default to `<source_catalog>.<source_schema>.metadata_delta_load`.
    """

    strategy: str
    delta_load_identifier: str
    strategy_options: dict
    metadata_table_identifier: str | None = None

    @classmethod
    def from_yaml_str(cls, yaml_str: str) -> Self:
        """Creates an instance of DeltaLoadOptions from a YAML string.

        Args:
            yaml_str: YAML document whose top-level mapping provides the model fields.

        Raises:
            yaml.YAMLError: If the string is not valid YAML.
            pydantic.ValidationError: If the parsed mapping does not match the model.
        """
        options = yaml.safe_load(yaml_str)
        return cls(**options)

    @classmethod
    def from_file(cls, path: str | Path) -> Self:
        """Creates an instance of DeltaLoadOptions from a YAML file.

        Args:
            path: Path to a YAML file. Decoded as UTF-8, which the YAML
                specification mandates, rather than the locale default.
        """
        # read_text handles open/close and avoids the locale-dependent
        # default encoding of a bare open().
        return cls.from_yaml_str(Path(path).read_text(encoding="utf-8"))
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from functools import partial
|
|
4
|
+
|
|
5
|
+
from delta import DeltaTable # type: ignore[import-untyped]
|
|
6
|
+
from pyspark.sql import DataFrame
|
|
7
|
+
from pyspark.sql import functions as F
|
|
8
|
+
|
|
9
|
+
from ...integration.writer import CatalogWriter
|
|
10
|
+
from ...logging import LoggerMixin
|
|
11
|
+
from ...object_manager import TableManager
|
|
12
|
+
from ...session import SessionManager
|
|
13
|
+
from .delta_loader_metadata_table import DeltaLoaderMetadataTable
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DeltaLoader(ABC, LoggerMixin):
    """Base class for delta load operations.

    Concrete strategies implement ``read_data`` and ``verify``; this base
    class owns the bookkeeping in the metadata table that tracks which delta
    loads have been read and consumed.

    Args:
        table_identifier: Identifier for the table to be loaded.
        delta_load_identifier: Identifier for the delta load.
        metadata_table_identifier: Identifier for the metadata table. If None,
            the metadata_table_identifier will be derived from the table identifier:
            `<table_catalog>.<table_schema>.metadata_delta_load`.

    Raises:
        ValueError: If `metadata_table_identifier` is None and `table_identifier`
            does not contain at least `<catalog>.<schema>`.
    """

    def __init__(
        self,
        table_identifier: str,
        delta_load_identifier: str,
        metadata_table_identifier: str | None = None,
    ):
        self._spark = SessionManager.get_spark_session()
        self._console_logger = self.get_console_logger()
        self.table_identifier = table_identifier
        self.delta_load_identifier = delta_load_identifier
        if metadata_table_identifier is None:
            # Derive `<catalog>.<schema>.metadata_delta_load`; split once and
            # validate instead of indexing the split result blindly (which
            # raised a bare IndexError on malformed identifiers).
            parts = self.table_identifier.split(".")
            if len(parts) < 2:
                raise ValueError(
                    f"Cannot derive a metadata table identifier from '{self.table_identifier}': "
                    "expected at least '<catalog>.<schema>'."
                )
            metadata_table_identifier = f"{parts[0]}.{parts[1]}.metadata_delta_load"
        self.metadata_table_identifier = metadata_table_identifier
        # Ensure the metadata table exists before any read or consume operation.
        table_manager = TableManager()
        table_manager.create_table(table=DeltaLoaderMetadataTable(identifier=self.metadata_table_identifier))

    @abstractmethod
    def read_data(
        self,
        options: dict[str, str] | None = None,
    ) -> DataFrame:
        """Reads data incrementally using a strategy.

        Args:
            options: Additional DataFrameReader options.
        """
        pass

    @abstractmethod
    def verify(self) -> None:
        """Verify that the source table qualifies for the delta load strategy."""
        pass

    def _query(self, query: str) -> DataFrame:
        """Runs a Spark SQL query and returns the resulting DataFrame."""
        df = self._spark.sql(query)
        return df

    def _create_metadata_entry(
        self,
        *,
        rows: int,
        last_read_timestamp: datetime | None = None,
        start_version: int | None = None,
        end_version: int | None = None,
        start_commit_timestamp: datetime | None = None,
        end_commit_timestamp: datetime | None = None,
    ) -> None:
        """Creates an entry in the metadata table for the delta load.

        Args:
            rows: Number of rows read by this delta load.
            last_read_timestamp: Upper bound of the read window — presumably
                used by the timestamp strategy; None otherwise.
            start_version: First Delta commit version covered, if applicable.
            end_version: Last Delta commit version covered, if applicable.
            start_commit_timestamp: Timestamp of the first covered commit.
            end_commit_timestamp: Timestamp of the last covered commit.
        """
        self._console_logger.info(
            f"Creating metadata entry for table: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ]",
        )
        # One-row DataFrame carrying the new metadata entry.
        metadata_df = self._spark.range(1)
        metadata_df = metadata_df.select(
            # Cast before aliasing so the column is unambiguously named `rows`.
            F.lit(rows).cast("bigint").alias("rows"),
            F.lit(False).alias("is_processed"),
            F.lit(False).alias("is_stale"),
            F.lit(self.table_identifier).alias("source_table_identifier"),
            F.lit(self.delta_load_identifier).alias("delta_load_identifier"),
            F.lit(start_version).alias("start_commit_version"),
            F.lit(end_version).alias("end_commit_version"),
            F.lit(start_commit_timestamp).alias("start_commit_timestamp_utc"),
            F.lit(end_commit_timestamp).alias("end_commit_timestamp_utc"),
            F.lit(last_read_timestamp).alias("last_read_timestamp"),
            F.current_timestamp().alias("__DCR"),
            F.current_timestamp().alias("__DMR"),
        ).withColumn(
            "BK",
            # Surrogate key: hash of table, load identifier and current time.
            F.md5(
                F.concat_ws(
                    "-",
                    F.col("source_table_identifier"),
                    F.col("delta_load_identifier"),
                    F.current_timestamp(),
                ),
            ),
        )
        catalog_writer = CatalogWriter()
        catalog_writer.write_table(
            df=metadata_df,
            table_identifier=self.metadata_table_identifier,
            mode="append",
        )

    def _invalidate_versions(self) -> None:
        """Invalidate any pending changes in the metadata for the delta load."""
        self._console_logger.info(
            f"Invalidating unprocessed delta load metadata for table: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ]",
        )
        delta_table = DeltaTable.forName(self._spark, self.metadata_table_identifier)
        # Only unprocessed, non-stale entries of this load are marked stale.
        delta_table.update(
            condition=(F.col("source_table_identifier") == self.table_identifier)
            & (F.col("delta_load_identifier") == self.delta_load_identifier)
            & ~F.col("is_processed")
            & ~F.col("is_stale"),
            set={"is_stale": F.lit(True), "__DMR": F.current_timestamp()},
        )

    def reset_cdf(self) -> None:
        """Invalidates all changes in the metadata for the delta load."""
        delta_table = DeltaTable.forName(self._spark, self.metadata_table_identifier)
        self._console_logger.info(
            f"Resetting delta load metadata for table: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ]",
        )
        # Unlike _invalidate_versions, processed entries are invalidated too,
        # so the next run starts from scratch.
        delta_table.update(
            condition=(F.col("source_table_identifier") == self.table_identifier)
            & (F.col("delta_load_identifier") == self.delta_load_identifier)
            & ~F.col("is_stale"),
            set={"is_stale": F.lit(True), "__DMR": F.current_timestamp()},
        )

    def consume_data(self) -> None:
        """Marks data as consumed in the metadata for the delta load."""
        df = self._spark.table(self.metadata_table_identifier)
        df = df.filter(
            (F.col("source_table_identifier") == self.table_identifier)
            & (F.col("delta_load_identifier") == self.delta_load_identifier)
            & ~F.col("is_processed")
            & ~F.col("is_stale"),
        )
        # NOTE(review): limit(1) marks at most one pending metadata row as
        # processed per call — confirm multiple pending entries per load
        # cannot exist, or are intentionally consumed one at a time.
        df = df.groupBy("BK", "delta_load_identifier").agg(F.max("__DCR")).limit(1)
        self._console_logger.info(
            f"Mark metadata for table as processed: [ {self.table_identifier} ] with Delta Load Identifier: [ {self.delta_load_identifier} ].",
        )
        delta_table = DeltaTable.forName(self._spark, self.metadata_table_identifier)
        delta_table.alias("target").merge(df.alias("source"), "target.BK = source.BK").whenMatchedUpdate(
            set={
                "is_processed": F.lit(True),
                "__DMR": F.current_timestamp(),
            },
        ).execute()

    def write_data(self, write_callable: partial):
        """Wrapper to write and consume a delta load.

        Args:
            write_callable: Zero-argument callable (typically a
                `functools.partial`) performing the actual write.

        Raises:
            RuntimeError: If the wrapped write fails; the original exception
                is chained as the cause.
        """
        try:
            write_callable()
        except Exception as e:
            raise RuntimeError("Error while writing...") from e
        # Only mark the load consumed after the write succeeded.
        self.consume_data()
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from .delta_load_options import DeltaLoadOptions
|
|
4
|
+
from .delta_loader import DeltaLoader
|
|
5
|
+
from .strategies import DeltaCDFConfig, DeltaCDFLoader, DeltaTimestampConfig, DeltaTimestampLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def consume_delta_load(
    runtime_info: dict[str, Any],
    delta_load_identifier: str | None = None,
) -> None:
    """Consumes a delta load by updating the metadata table.

    Args:
        runtime_info: Runtime information.
        delta_load_identifier: If set, the ConsumeDeltaLoadAction action
            will only consume DeltaLoader transaction for the given
            delta_load_identifier.
    """
    for source_table, loader_options in runtime_info["delta_load_options"].items():
        # A specific identifier was requested: skip every non-matching entry.
        if delta_load_identifier is not None and delta_load_identifier != loader_options.get("delta_load_identifier"):
            continue
        loader: DeltaLoader = DeltaLoaderFactory.create_loader(
            table_identifier=source_table,
            options=DeltaLoadOptions(**loader_options),
        )
        loader.consume_data()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DeltaLoaderFactory:
    """Factory to create a DeltaLoader instance based on the DeltaLoadOptions."""

    @staticmethod
    def create_loader(table_identifier: str, options: DeltaLoadOptions) -> DeltaLoader:
        """Creates an instance of DeltaLoader, choosing the desired strategy."""
        # Normalize once; strategy matching is case-insensitive.
        strategy = options.strategy.upper()
        if strategy == "CDF":
            return DeltaCDFLoader(
                table_identifier=table_identifier,
                delta_load_identifier=options.delta_load_identifier,
                config=DeltaCDFConfig(**options.strategy_options),
                metadata_table_identifier=options.metadata_table_identifier,
            )
        if strategy == "TIMESTAMP":
            return DeltaTimestampLoader(
                table_identifier=table_identifier,
                delta_load_identifier=options.delta_load_identifier,
                config=DeltaTimestampConfig(**options.strategy_options),
                metadata_table_identifier=options.metadata_table_identifier,
            )
        raise ValueError(f"Unknown strategy: {options.strategy}")
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from ...models import Column, Table
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DeltaLoaderMetadataTable(Table):
    """A Table Model for the Delta CDF Reader Metadata Table."""

    data_source_format: str = "DELTA"
    partition_by: list[str] = ["source_table_identifier"]
    liquid_clustering: bool = True
    columns: list[Column] = [
        # Key and load identification.
        Column(name="BK", data_type="STRING"),
        Column(name="delta_load_identifier", data_type="STRING"),
        Column(name="source_table_identifier", data_type="STRING"),
        # Processing state flags.
        Column(name="is_processed", data_type="BOOLEAN"),
        Column(name="is_stale", data_type="BOOLEAN"),
        Column(name="last_read_timestamp", data_type="TIMESTAMP", nullable=True),
        Column(name="rows", data_type="BIGINT"),
        # Commit window covered by the load — nullable, presumably unset for
        # strategies that do not track commit versions/timestamps.
        Column(name="start_commit_version", data_type="INT", nullable=True),
        Column(name="end_commit_version", data_type="INT", nullable=True),
        Column(name="start_commit_timestamp_utc", data_type="TIMESTAMP", nullable=True),
        Column(name="end_commit_timestamp_utc", data_type="TIMESTAMP", nullable=True),
        # Technical audit timestamps (created / modified).
        Column(name="__DCR", data_type="TIMESTAMP"),
        Column(name="__DMR", data_type="TIMESTAMP"),
    ]
|