tgedr-dataops 0.0.36__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +60 -39
- tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
- tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
- {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
- tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
- tgedr/dataops/chain.py +0 -51
- tgedr/dataops/commons/dataset.py +0 -23
- tgedr/dataops/commons/metadata.py +0 -172
- tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr/dataops/etl.py +0 -112
- tgedr/dataops/processor.py +0 -27
- tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr/dataops/sink/sink.py +0 -46
- tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr/dataops/source/pd_df_s3_source.py +0 -51
- tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr/dataops/source/source.py +0 -51
- tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr/dataops/store/spark_delta.py +0 -369
- tgedr/dataops/store/store.py +0 -49
- tgedr/dataops/utils_reflection.py +0 -134
- tgedr/dataops/validation/abs.py +0 -46
- tgedr/dataops/validation/pandas.py +0 -10
- tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.36.dist-info/METADATA +0 -20
- tgedr_dataops-0.0.36.dist-info/RECORD +0 -37
- tgedr_dataops-0.0.36.dist-info/top_level.txt +0 -1
- {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
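The file list above shows the top-level package moving from the `tgedr/dataops` namespace layout to a flat `tgedr_dataops` package. A minimal sketch of how imports would likely need to change for modules present in both versions (module paths are taken from the list above; no class names are assumed):

```python
# Before (0.0.36): namespace-package layout
from tgedr.dataops.source import s3_file_source
from tgedr.dataops.sink import s3_file_sink

# After (1.0.1): flat package layout
from tgedr_dataops.source import s3_file_source
from tgedr_dataops.sink import s3_file_sink
```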
tgedr/dataops/store/s3_single_partition_parquet.py
DELETED
@@ -1,102 +0,0 @@
-import logging
-import s3fs
-import logging
-from typing import Any, Dict, List, Optional
-import pandas as pd
-import pyarrow as pa
-
-from tgedr.dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
-from tgedr.dataops.commons.utils_fs import remove_s3_protocol
-
-
-logger = logging.getLogger(__name__)
-
-
-class S3FsSinglePartitionParquetStore(FsSinglePartitionParquetStore):  # pragma: no cover
-    """FsSinglePartitionParquetStore implementation using aws s3 file system"""
-
-    CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "aws_access_key_id"
-    CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "aws_secret_access_key"
-    CONFIG_KEY_AWS_SESSION_TOKEN: str = "aws_session_token"
-
-    @property
-    def fs(self):
-        if self._fs is None:
-            if (self._config is not None) and all(
-                element in list(self._config.keys())
-                for element in [
-                    self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
-                    self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
-                    self.CONFIG_KEY_AWS_SESSION_TOKEN,
-                ]
-            ):
-                self._fs = s3fs.S3FileSystem(
-                    key=self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
-                    secret=self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
-                    token=self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
-                )
-            else:
-                self._fs = s3fs.S3FileSystem()
-        return self._fs
-
-    def _rmdir(self, key):
-        if self.fs.isdir(key):
-            self.fs.delete(key, recursive=True)
-
-    def _exists(self, key) -> bool:
-        return self.fs.get_file_info(key).type.name != "NotFound"
-
-    def get(
-        self,
-        key: str,
-        filter: callable = None,
-        filters: List[tuple[str, str, List[str]]] = None,
-        schema: pa.Schema = None,
-    ) -> pd.DataFrame:
-        return super().get(key=remove_s3_protocol(key), filter=filter, filters=filters, schema=schema)
-
-    def delete(
-        self,
-        key: str,
-        partition_field: Optional[str] = None,
-        partition_values: Optional[List[str]] = None,
-        kv_dict: Optional[Dict[str, List[Any]]] = None,
-        schema: pa.Schema = None,
-    ):
-        super().delete(
-            key=remove_s3_protocol(key),
-            partition_field=partition_field,
-            partition_values=partition_values,
-            kv_dict=kv_dict,
-            schema=schema,
-        )
-
-    def save(
-        self,
-        df: pd.DataFrame,
-        key: str,
-        partition_field: Optional[str] = None,
-        append: bool = False,
-        replace_partitions: bool = False,
-        schema: Any = None,
-    ):
-        super().save(
-            df=df,
-            key=remove_s3_protocol(key),
-            partition_field=partition_field,
-            append=append,
-            replace_partitions=replace_partitions,
-            schema=schema,
-        )
-
-    def update(
-        self,
-        df: pd.DataFrame,
-        key: str,
-        key_fields: List[str],
-        partition_field: Optional[str] = None,
-        schema: Any = None,
-    ):
-        super().update(
-            df=df, key=remove_s3_protocol(key), key_fields=key_fields, partition_field=partition_field, schema=schema
-        )
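A minimal usage sketch of the removed S3FsSinglePartitionParquetStore, inferred only from the deleted source above; the bucket, credentials, and DataFrame are placeholders, and the `config` dict constructor is assumed to come from the `Store` base class:

```python
import pandas as pd

from tgedr.dataops.store.s3_single_partition_parquet import S3FsSinglePartitionParquetStore

# credentials map to the CONFIG_KEY_* constants defined on the class;
# omitting them falls back to s3fs.S3FileSystem() default credential resolution
store = S3FsSinglePartitionParquetStore(
    config={
        "aws_access_key_id": "<key>",
        "aws_secret_access_key": "<secret>",
        "aws_session_token": "<token>",
    }
)

df = pd.DataFrame({"id": [1, 2], "region": ["eu", "us"]})
# keys are S3 URIs; the overrides strip the "s3://" prefix before delegating to the parent store
store.save(df=df, key="s3://some-bucket/some/dataset", partition_field="region")
result = store.get(key="s3://some-bucket/some/dataset")
```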
tgedr/dataops/store/spark_delta.py
DELETED
@@ -1,369 +0,0 @@
-from abc import ABC
-import dataclasses
-import logging
-from typing import Any, Dict, List, Optional, Union
-from datetime import datetime
-from pyspark.sql import DataFrame
-from delta.tables import DeltaTable
-from pyspark.sql import functions as F
-from pyspark.sql import types as T
-from pyspark.sql.utils import AnalysisException
-from pyspark.sql.functions import monotonically_increasing_id
-from tgedr.dataops.store.store import NoStoreException, Store, StoreException
-from tgedr.dataops.commons.metadata import Metadata
-from tgedr.dataops.commons.utils_spark import UtilsSpark
-
-logger = logging.getLogger(__name__)
-
-
-class SparkDeltaStore(Store, ABC):
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        Store.__init__(self, config)
-
-    def get(self, key: str, version: str = None, **kwargs) -> DataFrame:
-        logger.info(f"[get|in] ({key}, {version})")
-
-        table = self._get_table(path=key)
-        if table is None:
-            raise NoStoreException(f"[get] couldn't find data in key: {key}")
-
-        reader = UtilsSpark.get_spark_session().read.format("delta")
-        if version is not None:
-            reader = reader.option("versionAsOf", version)
-
-        result = reader.load(key)
-
-        logger.info("[get_df|out]")
-        return result
-
-    def __get_deletion_criteria(self, df):
-        logger.debug("[__get_deletion_criteria|in])")
-        fields = df.dtypes
-        numerics = [
-            x
-            for x in fields
-            if x[1] in ["bigint", "int", "double", "float", "long", "decimal.Decimal"] or (x[1][:7]) == "decimal"
-        ]
-        dates = [x for x in fields if (x[1]) in ["datetime", "datetime.datetime"]]
-        textuals = [x for x in fields if x[1] in ["string"]]
-        if 0 < len(numerics):
-            column = numerics[0][0]
-            result = (F.col(column) > 0) | (F.col(column) <= 0)
-        elif 0 < len(dates):
-            column = dates[0][0]
-            now = datetime.now()
-            result = (F.col(column) > now) | (F.col(column) <= now)
-        elif 0 < len(textuals):
-            column = textuals[0][0]
-            result = (F.col(column) > "a") | (F.col(column) <= "a")
-        else:
-            raise StoreException(
-                f"[__get_deletion_criteria] failed to figure out column types handy to create a full deletion criteria"
-            )
-
-        logger.debug(f"[__get_deletion_criteria|out] = {result}")
-        return result
-
-    def delete(self, key: str, condition: Union[F.Column, str, None] = None, **kwargs) -> None:
-        logger.info(f"[delete|in] ({key}, {condition})")
-
-        spark = UtilsSpark.get_spark_session()
-        """
-        is_s3_operation = True if key.startswith("s3") else False
-        if is_s3_operation:
-        """
-        delta_table = DeltaTable.forPath(spark, key)
-        if condition is None:
-            condition = self.__get_deletion_criteria(delta_table.toDF())
-        delta_table.delete(condition=condition)
-        """
-        else:  # local development mostly for temporary or test purposes
-            spark_fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
-            # get spark context path
-            spark_path = spark._jvm.org.apache.hadoop.fs.Path(key)
-            logger.info(f"[delete] spark path is {spark_path}")
-            try:
-                if spark_fs.exists(spark_path):
-                    spark_fs.delete(spark_path, True)
-            except AnalysisException as x:
-                raise StoreException(f"[delete] couldn't do it on key {key}: {x}")
-        """
-        logger.info("[delete|out]")
-
-    def save(
-        self,
-        df: DataFrame,
-        key: str,
-        append: bool = False,
-        partition_fields: Optional[List[str]] = None,
-        metadata: Optional[Metadata] = None,
-        retention_days: int = 7,
-        deleted_retention_days: int = 7,
-        column_descriptions: Optional[Dict[str, str]] = None,
-        table_name: Optional[str] = None,
-        **kwargs,
-    ):
-        logger.info(
-            f"[save|in] ({df}, {key}, {append}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {column_descriptions}, {table_name}, {kwargs})"
-        )
-
-        if column_descriptions is not None:
-            df = self._set_column_descriptions(df, column_descriptions)
-
-        if append:
-            writer = df.write.format("delta").mode("append")
-        else:
-            writer = df.write.format("delta").mode("overwrite")
-
-        if partition_fields is not None:
-            table = self._get_table(path=key)
-            if table is not None:
-                self._set_table_partitions(path=key, partition_fields=partition_fields)
-            writer = writer.partitionBy(*partition_fields)
-
-        if self._has_schema_changed(path=key, df=df):
-            writer = writer.option("overwriteSchema", "true")
-
-        if metadata:
-            writer = writer.option("userMetadata", metadata)
-
-        if table_name is not None:
-            # assume we have db.table
-            db = table_name.split(".")[0]
-            UtilsSpark.get_spark_session().sql(f"CREATE DATABASE IF NOT EXISTS {db}")
-            writer = writer.option("path", key).saveAsTable(table_name)
-        else:
-            writer.save(key)
-
-        logger.info(f"[save] optimizing...")
-        table = self._get_table(path=key)
-
-        if retention_days is not None and deleted_retention_days is not None:
-            self.enforce_retention_policy(
-                path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
-            )
-        elif retention_days is not None:
-            self.enforce_retention_policy(path=key, retention_days=retention_days)
-
-        table.optimize().executeCompaction()
-
-        logger.info(f"[save|out]")
-
-    def update(
-        self,
-        df: Any,
-        key: str,
-        match_fields: List[str],
-        partition_fields: Optional[List[str]] = None,
-        metadata: Optional[Metadata] = None,
-        retention_days: int = 7,
-        deleted_retention_days: int = 7,
-        **kwargs,
-    ):
-        logger.info(
-            f"[update|in] ({df}, {key}, {match_fields}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {kwargs})"
-        )
-
-        table = self._get_table(path=key)
-        if table is None:
-            self.save(
-                df=df,
-                key=key,
-                partition_fields=partition_fields,
-                metadata=metadata,
-                retention_days=retention_days,
-                deleted_retention_days=deleted_retention_days,
-                **kwargs,
-            )
-        else:
-            if partition_fields is not None:
-                self._set_table_partitions(path=key, partition_fields=partition_fields)
-
-            match_clause = None
-            for field in match_fields:
-                match_clause = (
-                    f"current.{field} = updates.{field}"
-                    if match_clause is None
-                    else f"{match_clause} and current.{field} = updates.{field}"
-                )
-            logger.info(f"[update] match clause: {match_clause}")
-
-            # check if the df has all the required columns
-            # as we are upserting the updated columns coming in must at least match or exceed the current columns
-            for column in table.toDF().columns:
-                # we'll assume missing columns are nullable, typically metrics
-                if column not in df.columns:
-                    df = df.withColumn(column, F.lit(None).cast(T.StringType()))
-
-            table.alias("current").merge(
-                df.alias("updates"), match_clause
-            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
-
-            if retention_days is not None and deleted_retention_days is not None:
-                self.enforce_retention_policy(
-                    path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
-                )
-            elif retention_days is not None:
-                self.enforce_retention_policy(path=key, retention_days=retention_days)
-
-            table.optimize().executeCompaction()
-
-        logger.info("[UtilsDeltaTable.upsert|out]")
-
-    def enforce_retention_policy(self, path: str, retention_days: int = 7, deleted_retention_days: int = 7):
-        logger.info(f"[enforce_retention_policy|in] ({path}, {retention_days}, {deleted_retention_days})")
-
-        retention = f"interval {retention_days} days"
-        deleted_retention = f"interval {deleted_retention_days} days"
-
-        UtilsSpark.get_spark_session().sql(
-            f"ALTER TABLE delta.`{path}` SET TBLPROPERTIES('delta.logRetentionDuration' = '{retention}', 'delta.deletedFileRetentionDuration' = '{deleted_retention}')"
-        )
-        logger.info("[enforce_retention_policy|out]")
-
-    def get_latest_table_versions(self, path: str, how_many: int = 1) -> List[str]:
-        """
-        checks the delta table history and retrieves the latest n versions
-        sorted from the newest to the oldest
-        """
-        logger.info(f"[get_latest_table_versions|in] ({path}, {how_many})")
-        result: List[str] = []
-
-        table = self._get_table(path=path)
-        if table is not None:
-            history_rows = table.history().orderBy(F.desc("timestamp")).limit(how_many)
-            result = [str(x.version) for x in history_rows.collect()]
-
-        logger.info(f"[get_latest_table_versions|out] => {result}")
-        return result
-
-    def get_metadata(self, path: str, version: str = None) -> Optional[Metadata]:
-        """
-        Raises
-        ------
-        NoStoreException
-        """
-        logger.info(f"[get_metadata|in] ({path}, {version})")
-        table = self._get_table(path)
-        if table is None:
-            raise NoStoreException(f"[get_metadata] no data in path: {path}")
-
-        result = None
-
-        df_history = table.history().filter(F.col("userMetadata").isNotNull())
-        if version is not None:
-            df_history = df_history.filter(F.col("version") <= int(version))
-
-        df_history = df_history.orderBy(F.col("version").desc())
-        if not df_history.isEmpty():
-            userMetadata = df_history.take(1)[0].userMetadata
-            result = Metadata.from_str(userMetadata)
-            if version is not None:
-                result = dataclasses.replace(result, version=version)
-
-        logger.info(f"[get_metadata|out] => ({result})")
-        return result
-
-    def _get_delta_log(self, path: str) -> DataFrame:
-        logger.info(f"[_get_delta_log|in] ({path})")
-
-        spark = UtilsSpark.get_spark_session()
-        jdf = (
-            spark._jvm.org.apache.spark.sql.delta.DeltaLog.forTable(spark._jsparkSession, path)
-            .snapshot()
-            .allFiles()
-            .toDF()
-        )
-        result = DataFrame(jdf, spark)
-
-        logger.info(f"[_get_delta_log|out] => {result}")
-        return result
-
-    def _get_table_partitions(self, path: str) -> List[str]:
-        logger.info(f"[_get_table_partitions|in] ({path})")
-        result: List[str] = []
-
-        delta_log: DataFrame = self._get_delta_log(path=path)
-        partition_keys = [
-            x.keys
-            for x in delta_log.select(F.map_keys(F.col("partitionValues")).alias("keys")).distinct().collect()
-            if 0 < len(x)
-        ]
-        if 0 < len(partition_keys):
-            result: List[str] = list({y for y in partition_keys for y in y})
-
-        logger.info(f"[_get_table_partitions|out] => {result}")
-        return result
-
-    def _vacuum_now(self, path: str):
-        logger.info("[_vacuum_now|in]")
-
-        spark = UtilsSpark.get_spark_session()
-        old_conf_value = spark.conf.get("spark.databricks.delta.retentionDurationCheck.enabled")
-        spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
-        DeltaTable.forPath(spark, path).vacuum(0)
-        spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", old_conf_value)
-
-        logger.info("[_vacuum_now|out]")
-
-    def _has_schema_changed(self, path: str, df: DataFrame) -> bool:
-        logger.info(f"[_has_schema_changed|in] ({path},{df})")
-        result: bool = False
-        table = self._get_table(path=path)
-        if table is not None:
-            result = table.toDF().schema != df.schema
-        logger.info(f"[_has_schema_changed|out] => {result}")
-        return result
-
-    def _set_table_partitions(self, path: str, partition_fields: List[str]) -> None:
-        logger.info(f"[_set_table_partitions|in] ({path},{partition_fields})")
-
-        spark = UtilsSpark.get_spark_session()
-        # let's check partition_cols
-        current_partition_fields = self._get_table_partitions(path=path)
-        shall_we_repartition = sorted(partition_fields) != sorted(current_partition_fields)
-
-        if shall_we_repartition:
-            logger.info("[_set_table_partitions] going to repartition")
-            new_df = spark.read.format("delta").load(path)
-            new_df.write.format("delta").mode("overwrite").partitionBy(*partition_fields).option(
-                "overwriteSchema", "true"
-            ).save(path)
-            self._vacuum_now(path)
-            logger.info(
-                f"[_set_table_partitions] changed partition cols from {current_partition_fields} to {partition_fields}"
-            )
-        logger.info("[_set_table_partitions|out]")
-
-    def _get_table(self, path) -> Optional[DeltaTable]:
-        logger.debug(f"[_get_table|in] ({path})")
-        result: DeltaTable = None
-        try:
-            result: DeltaTable = DeltaTable.forPath(UtilsSpark.get_spark_session(), path)
-        except AnalysisException as ax:
-            logger.warning(f"[_get_table] couldn't load from {path}: {ax}")
-
-        logger.debug(f"[_get_table|out] => {result}")
-        return result
-
-    def set_column_comments(self, db: str, table: str, col_comments: Dict[str, str]) -> None:
-        logger.info(f"[set_column_comments|in] ({db}, {table}, {col_comments})")
-        spark = UtilsSpark.get_spark_session()
-
-        table_description: DataFrame = spark.sql(f"describe {db}.{table}").withColumn(
-            "set_column_comments_id", monotonically_increasing_id()
-        )
-        id = table_description.filter(F.col("col_name") == "# Partitioning").collect()[0].set_column_comments_id
-        table_description = table_description.filter(
-            (F.col("set_column_comments_id") < F.lit(id)) & (F.col("col_name") != "")
-        ).drop("set_column_comments_id")
-        rows = [r.asDict() for r in table_description.collect()]
-        for row in rows:
-            col = row["col_name"]
-            data_type = row["data_type"]
-            if col in col_comments:
-                new_comment = col_comments[col]
-                logger.info(f"[set_column_comments] setting new comment ({new_comment}) to column {col}")
-                spark.sql(f"ALTER TABLE {db}.{table} CHANGE COLUMN {col} {col} {data_type} COMMENT '{new_comment}'")
-
-        logger.info("[set_column_comments|out]")
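An illustrative sketch of how the removed SparkDeltaStore was driven, based only on the deleted source above; it assumes a Delta-enabled Spark session reachable through UtilsSpark.get_spark_session(), and the path and column names are placeholders:

```python
from pyspark.sql import SparkSession

from tgedr.dataops.store.spark_delta import SparkDeltaStore

spark = SparkSession.builder.getOrCreate()
store = SparkDeltaStore()

df = spark.createDataFrame([(1, "eu"), (2, "us")], ["id", "region"])

# overwrite the table, partitioned by "region", keeping the default 7-day retention
store.save(df=df, key="/tmp/delta/example", partition_fields=["region"])

# upsert: rows matching on "id" are updated, unmatched ids are inserted
updates = spark.createDataFrame([(2, "uk"), (3, "pt")], ["id", "region"])
store.update(df=updates, key="/tmp/delta/example", match_fields=["id"])

store.get(key="/tmp/delta/example").show()
```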
tgedr/dataops/store/store.py
DELETED
@@ -1,49 +0,0 @@
-import abc
-from typing import Any, Dict, Optional
-
-
-class StoreException(Exception):
-    pass
-
-
-class NoStoreException(StoreException):
-    pass
-
-
-class StoreInterface(metaclass=abc.ABCMeta):
-    @classmethod
-    def __subclasshook__(cls, subclass):
-        return (
-            hasattr(subclass, "get")
-            and callable(subclass.get)
-            and hasattr(subclass, "delete")
-            and callable(subclass.delete)
-            and hasattr(subclass, "save")
-            and callable(subclass.save)
-            and hasattr(subclass, "update")
-            and callable(subclass.update)
-        ) or NotImplemented
-
-
-@StoreInterface.register
-class Store(abc.ABC):
-    """abstract class used to manage persistence, defining CRUD-like (CreateReadUpdateDelete) methods"""
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        self._config = config
-
-    @abc.abstractmethod
-    def get(self, key: str, **kwargs) -> Any:
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def delete(self, key: str, **kwargs) -> None:
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def save(self, df: Any, key: str, **kwargs):
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def update(self, df: Any, key: str, **kwargs):
-        raise NotImplementedError()
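The removed Store base class defines a small CRUD-like contract; a minimal, hypothetical in-memory implementation, shown only to illustrate the interface above:

```python
from typing import Any, Dict, Optional

from tgedr.dataops.store.store import Store, StoreInterface


class DictStore(Store):
    """toy implementation keeping entries in a plain dict"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self._data: Dict[str, Any] = {}

    def get(self, key: str, **kwargs) -> Any:
        return self._data[key]

    def delete(self, key: str, **kwargs) -> None:
        self._data.pop(key, None)

    def save(self, df: Any, key: str, **kwargs):
        self._data[key] = df

    def update(self, df: Any, key: str, **kwargs):
        self._data[key] = df


store = DictStore()
store.save(df={"a": 1}, key="example")
# the __subclasshook__ above makes any class exposing get/delete/save/update a virtual StoreInterface
assert issubclass(DictStore, StoreInterface)
```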
tgedr/dataops/utils_reflection.py
DELETED
@@ -1,134 +0,0 @@
-import importlib
-import inspect
-import logging
-import os
-import sys
-from importlib import import_module
-from typing import Dict, Any, List
-
-
-logger = logging.getLogger(__name__)
-
-
-class UtilsReflectionException(Exception):
-    pass
-
-
-class UtilsReflection:
-    __MODULE_EXTENSIONS = (".py", ".pyc", ".pyo")
-
-    @staticmethod
-    def load_class(clazz: str, parent_check: type = None) -> Any:
-        logger.debug(f"[load_class|in] (clazz={clazz}, parent_check={parent_check})")
-        type_elements = clazz.split(".")
-        module = ".".join(type_elements[:-1])
-        _clazz = type_elements[-1]
-
-        result = getattr(import_module(module), _clazz)
-
-        if not callable(result):
-            raise TypeError(f"Object {_clazz} in {module} is not callable.")
-
-        if parent_check and (not issubclass(result, parent_check)):
-            raise TypeError(f"Wrong class type, it is not a subclass of {parent_check.__name__}")
-
-        logger.debug(f"[load_class|out] => {result}")
-        return result
-
-    @staticmethod
-    def load_subclass_from_module(module: str, clazz: str, super_clazz: type) -> Any:
-        logger.info(f"[load_subclass_from_module|in] (module={module}, clazz={clazz}, super_clazz={super_clazz})")
-        result = getattr(import_module(module), clazz)
-
-        if not callable(result):
-            raise TypeError(f"Object {clazz} in {module} is not callable.")
-
-        if super_clazz and (not issubclass(result, super_clazz)):
-            raise TypeError(f"Wrong class type, it is not a subclass of {super_clazz.__name__}")
-
-        logger.info(f"[load_subclass_from_module|out] => {result}")
-        return result
-
-    @staticmethod
-    def get_type(module: str, _type: str) -> type:
-        logger.info(f"[get_type|in] (module={module}, _type={_type})")
-        result = None
-
-        result = getattr(import_module(module), _type)
-
-        logger.info(f"[get_type|out] => {result}")
-        return result
-
-    @staticmethod
-    def is_subclass_of(sub_class: type, super_class: type) -> bool:
-        logger.info(f"[is_subclass_of|in] ({sub_class}, {super_class})")
-        result = False
-
-        if callable(sub_class) and issubclass(sub_class, super_class):
-            result = True
-
-        logger.info(f"[is_subclass_of|out] => {result}")
-        return result
-
-    @staticmethod
-    def find_module_classes(module: str) -> List[Any]:
-        logger.info(f"[find_module_classes|in] ({module})")
-        result = []
-        for name, obj in inspect.getmembers(sys.modules[module]):
-            if inspect.isclass(obj):
-                result.append(obj)
-        logger.info(f"[find_module_classes|out] => {result}")
-        return result
-
-    @staticmethod
-    def find_class_implementations_in_package(package_name: str, super_class: type) -> Dict[str, type]:
-        logger.info(f"[find_class_implementations_in_package|in] ({package_name}, {super_class})")
-        result = {}
-
-        the_package = importlib.import_module(package_name)
-        pkg_path = the_package.__path__[0]
-        modules = [
-            package_name + "." + module.split(".")[0]
-            for module in os.listdir(pkg_path)
-            if module.endswith(UtilsReflection.__MODULE_EXTENSIONS) and module != "__init__.py"
-        ]
-
-        logger.info(f"[find_class_implementations_in_package] found modules: {modules}")
-
-        for _module in modules:
-            if _module not in sys.modules:
-                importlib.import_module(_module)
-
-            for _class in UtilsReflection.find_module_classes(_module):
-                if UtilsReflection.is_subclass_of(_class, super_class) and _class != super_class:
-                    result[_module] = _class
-
-        logger.info(f"[find_class_implementations_in_package|out] => {result}")
-        return result
-
-    @staticmethod
-    def find_package_path(package_name: str) -> str:
-        logger.info(f"[find_package_path|in] ({package_name})")
-        the_package = importlib.import_module(package_name)
-        result = the_package.__path__[0]
-        logger.info(f"[find_package_path|out] => {result}")
-        return result
-
-    @staticmethod
-    def find_class_implementations(packages: str, clazz: Any) -> Dict[str, Any]:
-        """
-        throws UtilsReflectionException
-        """
-        logger.info(f"[find_class_implementations|in] ({packages}, {clazz})")
-        result = {}
-        _packages = [a.strip() for a in packages.split(",")]
-
-        # find classes that extend clazz
-        for pack_name in _packages:
-            module_class_map = UtilsReflection.find_class_implementations_in_package(pack_name, clazz)
-            for mod, _clazz in module_class_map.items():
-                impl = mod.split(".")[-1]
-                result[impl] = _clazz
-
-        logger.info(f"[find_class_implementations|out] => {result}")
-        return result
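An illustrative use of the removed UtilsReflection helper, based on the deleted source above; the dotted class paths are just standard-library examples:

```python
from tgedr.dataops.utils_reflection import UtilsReflection

# resolve a class from its dotted path
decimal_cls = UtilsReflection.load_class("decimal.Decimal")
assert decimal_cls.__name__ == "Decimal"

# optionally verify it subclasses an expected base
ordered_cls = UtilsReflection.load_class("collections.OrderedDict", parent_check=dict)
print(ordered_cls())  # OrderedDict()
```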