tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
  2. tgedr_dataops/commons/utils_fs.py +187 -0
  3. tgedr_dataops/quality/pandas_validation.py +21 -0
  4. tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  5. {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
  6. tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  7. tgedr_dataops/source/local_fs_file_source.py +108 -0
  8. tgedr_dataops/source/pd_df_s3_source.py +130 -0
  9. {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
  10. tgedr_dataops/source/s3_file_extended_source.py +68 -0
  11. {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
  12. tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  13. tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  14. tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  15. tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
  16. tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
  17. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
  18. tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
  19. tgedr/dataops/chain.py +0 -51
  20. tgedr/dataops/commons/dataset.py +0 -23
  21. tgedr/dataops/commons/metadata.py +0 -172
  22. tgedr/dataops/commons/utils_fs.py +0 -85
  23. tgedr/dataops/commons/utils_spark.py +0 -87
  24. tgedr/dataops/etl.py +0 -112
  25. tgedr/dataops/processor.py +0 -27
  26. tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  27. tgedr/dataops/sink/sink.py +0 -46
  28. tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  29. tgedr/dataops/source/delta_table_source.py +0 -49
  30. tgedr/dataops/source/local_delta_table.py +0 -47
  31. tgedr/dataops/source/local_fs_file_source.py +0 -71
  32. tgedr/dataops/source/pd_df_s3_source.py +0 -76
  33. tgedr/dataops/source/s3_delta_table.py +0 -75
  34. tgedr/dataops/source/s3_file_extended_source.py +0 -39
  35. tgedr/dataops/source/source.py +0 -51
  36. tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  37. tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  38. tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  39. tgedr/dataops/store/spark_delta.py +0 -369
  40. tgedr/dataops/store/store.py +0 -49
  41. tgedr/dataops/utils_reflection.py +0 -134
  42. tgedr/dataops/validation/abs.py +0 -46
  43. tgedr/dataops/validation/pandas.py +0 -10
  44. tgedr/dataops/validation/pyspark.py +0 -10
  45. tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
  46. tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
  47. tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
  48. {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
  49. {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
  50. {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
  51. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
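The renames above show the code moving from the `tgedr.dataops` namespace package (0.0.37) to a flat `tgedr_dataops` top-level package (1.0.1). A minimal sketch of what that implies for callers, assuming the public class names (e.g. `FsSinglePartitionParquetStore`, visible in the removed code below) carry over unchanged to 1.0.1:

    # before (0.0.37) -- namespace package layout
    # from tgedr.dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore

    # after (1.0.1) -- flat package layout; the class name is assumed, check the 1.0.1 module
    from tgedr_dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore

    store = FsSinglePartitionParquetStore()  # hypothetical construction; verify the 1.0.1 signature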
tgedr/dataops/store/s3_single_partition_parquet.py (removed)
@@ -1,102 +0,0 @@
- import logging
- import s3fs
- import logging
- from typing import Any, Dict, List, Optional
- import pandas as pd
- import pyarrow as pa
-
- from tgedr.dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
- from tgedr.dataops.commons.utils_fs import remove_s3_protocol
-
-
- logger = logging.getLogger(__name__)
-
-
- class S3FsSinglePartitionParquetStore(FsSinglePartitionParquetStore): # pragma: no cover
-     """FsSinglePartitionParquetStore implementation using aws s3 file system"""
-
-     CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "aws_access_key_id"
-     CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "aws_secret_access_key"
-     CONFIG_KEY_AWS_SESSION_TOKEN: str = "aws_session_token"
-
-     @property
-     def fs(self):
-         if self._fs is None:
-             if (self._config is not None) and all(
-                 element in list(self._config.keys())
-                 for element in [
-                     self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
-                     self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
-                     self.CONFIG_KEY_AWS_SESSION_TOKEN,
-                 ]
-             ):
-                 self._fs = s3fs.S3FileSystem(
-                     key=self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
-                     secret=self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
-                     token=self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
-                 )
-             else:
-                 self._fs = s3fs.S3FileSystem()
-         return self._fs
-
-     def _rmdir(self, key):
-         if self.fs.isdir(key):
-             self.fs.delete(key, recursive=True)
-
-     def _exists(self, key) -> bool:
-         return self.fs.get_file_info(key).type.name != "NotFound"
-
-     def get(
-         self,
-         key: str,
-         filter: callable = None,
-         filters: List[tuple[str, str, List[str]]] = None,
-         schema: pa.Schema = None,
-     ) -> pd.DataFrame:
-         return super().get(key=remove_s3_protocol(key), filter=filter, filters=filters, schema=schema)
-
-     def delete(
-         self,
-         key: str,
-         partition_field: Optional[str] = None,
-         partition_values: Optional[List[str]] = None,
-         kv_dict: Optional[Dict[str, List[Any]]] = None,
-         schema: pa.Schema = None,
-     ):
-         super().delete(
-             key=remove_s3_protocol(key),
-             partition_field=partition_field,
-             partition_values=partition_values,
-             kv_dict=kv_dict,
-             schema=schema,
-         )
-
-     def save(
-         self,
-         df: pd.DataFrame,
-         key: str,
-         partition_field: Optional[str] = None,
-         append: bool = False,
-         replace_partitions: bool = False,
-         schema: Any = None,
-     ):
-         super().save(
-             df=df,
-             key=remove_s3_protocol(key),
-             partition_field=partition_field,
-             append=append,
-             replace_partitions=replace_partitions,
-             schema=schema,
-         )
-
-     def update(
-         self,
-         df: pd.DataFrame,
-         key: str,
-         key_fields: List[str],
-         partition_field: Optional[str] = None,
-         schema: Any = None,
-     ):
-         super().update(
-             df=df, key=remove_s3_protocol(key), key_fields=key_fields, partition_field=partition_field, schema=schema
-         )
tgedr/dataops/store/spark_delta.py (removed)
@@ -1,369 +0,0 @@
- from abc import ABC
- import dataclasses
- import logging
- from typing import Any, Dict, List, Optional, Union
- from datetime import datetime
- from pyspark.sql import DataFrame
- from delta.tables import DeltaTable
- from pyspark.sql import functions as F
- from pyspark.sql import types as T
- from pyspark.sql.utils import AnalysisException
- from pyspark.sql.functions import monotonically_increasing_id
- from tgedr.dataops.store.store import NoStoreException, Store, StoreException
- from tgedr.dataops.commons.metadata import Metadata
- from tgedr.dataops.commons.utils_spark import UtilsSpark
-
- logger = logging.getLogger(__name__)
-
-
- class SparkDeltaStore(Store, ABC):
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         Store.__init__(self, config)
-
-     def get(self, key: str, version: str = None, **kwargs) -> DataFrame:
-         logger.info(f"[get|in] ({key}, {version})")
-
-         table = self._get_table(path=key)
-         if table is None:
-             raise NoStoreException(f"[get] couldn't find data in key: {key}")
-
-         reader = UtilsSpark.get_spark_session().read.format("delta")
-         if version is not None:
-             reader = reader.option("versionAsOf", version)
-
-         result = reader.load(key)
-
-         logger.info("[get_df|out]")
-         return result
-
-     def __get_deletion_criteria(self, df):
-         logger.debug("[__get_deletion_criteria|in])")
-         fields = df.dtypes
-         numerics = [
-             x
-             for x in fields
-             if x[1] in ["bigint", "int", "double", "float", "long", "decimal.Decimal"] or (x[1][:7]) == "decimal"
-         ]
-         dates = [x for x in fields if (x[1]) in ["datetime", "datetime.datetime"]]
-         textuals = [x for x in fields if x[1] in ["string"]]
-         if 0 < len(numerics):
-             column = numerics[0][0]
-             result = (F.col(column) > 0) | (F.col(column) <= 0)
-         elif 0 < len(dates):
-             column = dates[0][0]
-             now = datetime.now()
-             result = (F.col(column) > now) | (F.col(column) <= now)
-         elif 0 < len(textuals):
-             column = textuals[0][0]
-             result = (F.col(column) > "a") | (F.col(column) <= "a")
-         else:
-             raise StoreException(
-                 f"[__get_deletion_criteria] failed to figure out column types handy to create a full deletion criteria"
-             )
-
-         logger.debug(f"[__get_deletion_criteria|out] = {result}")
-         return result
-
-     def delete(self, key: str, condition: Union[F.Column, str, None] = None, **kwargs) -> None:
-         logger.info(f"[delete|in] ({key}, {condition})")
-
-         spark = UtilsSpark.get_spark_session()
-         """
-         is_s3_operation = True if key.startswith("s3") else False
-         if is_s3_operation:
-         """
-         delta_table = DeltaTable.forPath(spark, key)
-         if condition is None:
-             condition = self.__get_deletion_criteria(delta_table.toDF())
-         delta_table.delete(condition=condition)
-         """
-         else: # local development mostly for temporary or test purposes
-             spark_fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
-             # get spark context path
-             spark_path = spark._jvm.org.apache.hadoop.fs.Path(key)
-             logger.info(f"[delete] spark path is {spark_path}")
-             try:
-                 if spark_fs.exists(spark_path):
-                     spark_fs.delete(spark_path, True)
-             except AnalysisException as x:
-                 raise StoreException(f"[delete] couldn't do it on key {key}: {x}")
-         """
-         logger.info("[delete|out]")
-
-     def save(
-         self,
-         df: DataFrame,
-         key: str,
-         append: bool = False,
-         partition_fields: Optional[List[str]] = None,
-         metadata: Optional[Metadata] = None,
-         retention_days: int = 7,
-         deleted_retention_days: int = 7,
-         column_descriptions: Optional[Dict[str, str]] = None,
-         table_name: Optional[str] = None,
-         **kwargs,
-     ):
-         logger.info(
-             f"[save|in] ({df}, {key}, {append}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {column_descriptions}, {table_name}, {kwargs})"
-         )
-
-         if column_descriptions is not None:
-             df = self._set_column_descriptions(df, column_descriptions)
-
-         if append:
-             writer = df.write.format("delta").mode("append")
-         else:
-             writer = df.write.format("delta").mode("overwrite")
-
-         if partition_fields is not None:
-             table = self._get_table(path=key)
-             if table is not None:
-                 self._set_table_partitions(path=key, partition_fields=partition_fields)
-             writer = writer.partitionBy(*partition_fields)
-
-         if self._has_schema_changed(path=key, df=df):
-             writer = writer.option("overwriteSchema", "true")
-
-         if metadata:
-             writer = writer.option("userMetadata", metadata)
-
-         if table_name is not None:
-             # assume we have db.table
-             db = table_name.split(".")[0]
-             UtilsSpark.get_spark_session().sql(f"CREATE DATABASE IF NOT EXISTS {db}")
-             writer = writer.option("path", key).saveAsTable(table_name)
-         else:
-             writer.save(key)
-
-         logger.info(f"[save] optimizing...")
-         table = self._get_table(path=key)
-
-         if retention_days is not None and deleted_retention_days is not None:
-             self.enforce_retention_policy(
-                 path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
-             )
-         elif retention_days is not None:
-             self.enforce_retention_policy(path=key, retention_days=retention_days)
-
-         table.optimize().executeCompaction()
-
-         logger.info(f"[save|out]")
-
-     def update(
-         self,
-         df: Any,
-         key: str,
-         match_fields: List[str],
-         partition_fields: Optional[List[str]] = None,
-         metadata: Optional[Metadata] = None,
-         retention_days: int = 7,
-         deleted_retention_days: int = 7,
-         **kwargs,
-     ):
-         logger.info(
-             f"[update|in] ({df}, {key}, {match_fields}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {kwargs})"
-         )
-
-         table = self._get_table(path=key)
-         if table is None:
-             self.save(
-                 df=df,
-                 key=key,
-                 partition_fields=partition_fields,
-                 metadata=metadata,
-                 retention_days=retention_days,
-                 deleted_retention_days=deleted_retention_days,
-                 **kwargs,
-             )
-         else:
-             if partition_fields is not None:
-                 self._set_table_partitions(path=key, partition_fields=partition_fields)
-
-             match_clause = None
-             for field in match_fields:
-                 match_clause = (
-                     f"current.{field} = updates.{field}"
-                     if match_clause is None
-                     else f"{match_clause} and current.{field} = updates.{field}"
-                 )
-             logger.info(f"[update] match clause: {match_clause}")
-
-             # check if the df has all the required columns
-             # as we are upserting the updated columns coming in must at least match or exceed the current columns
-             for column in table.toDF().columns:
-                 # we'll assume missing columns are nullable, typically metrics
-                 if column not in df.columns:
-                     df = df.withColumn(column, F.lit(None).cast(T.StringType()))
-
-             table.alias("current").merge(
-                 df.alias("updates"), match_clause
-             ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
-
-             if retention_days is not None and deleted_retention_days is not None:
-                 self.enforce_retention_policy(
-                     path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
-                 )
-             elif retention_days is not None:
-                 self.enforce_retention_policy(path=key, retention_days=retention_days)
-
-             table.optimize().executeCompaction()
-
-         logger.info("[UtilsDeltaTable.upsert|out]")
-
-     def enforce_retention_policy(self, path: str, retention_days: int = 7, deleted_retention_days: int = 7):
-         logger.info(f"[enforce_retention_policy|in] ({path}, {retention_days}, {deleted_retention_days})")
-
-         retention = f"interval {retention_days} days"
-         deleted_retention = f"interval {deleted_retention_days} days"
-
-         UtilsSpark.get_spark_session().sql(
-             f"ALTER TABLE delta.`{path}` SET TBLPROPERTIES('delta.logRetentionDuration' = '{retention}', 'delta.deletedFileRetentionDuration' = '{deleted_retention}')"
-         )
-         logger.info("[enforce_retention_policy|out]")
-
-     def get_latest_table_versions(self, path: str, how_many: int = 1) -> List[str]:
-         """
-         checks the delta table history and retrieves the latest n versions
-         sorted from the newest to the oldest
-         """
-         logger.info(f"[get_latest_table_versions|in] ({path}, {how_many})")
-         result: List[str] = []
-
-         table = self._get_table(path=path)
-         if table is not None:
-             history_rows = table.history().orderBy(F.desc("timestamp")).limit(how_many)
-             result = [str(x.version) for x in history_rows.collect()]
-
-         logger.info(f"[get_latest_table_versions|out] => {result}")
-         return result
-
-     def get_metadata(self, path: str, version: str = None) -> Optional[Metadata]:
-         """
-         Raises
-         ------
-         NoStoreException
-         """
-         logger.info(f"[get_metadata|in] ({path}, {version})")
-         table = self._get_table(path)
-         if table is None:
-             raise NoStoreException(f"[get_metadata] no data in path: {path}")
-
-         result = None
-
-         df_history = table.history().filter(F.col("userMetadata").isNotNull())
-         if version is not None:
-             df_history = df_history.filter(F.col("version") <= int(version))
-
-         df_history = df_history.orderBy(F.col("version").desc())
-         if not df_history.isEmpty():
-             userMetadata = df_history.take(1)[0].userMetadata
-             result = Metadata.from_str(userMetadata)
-             if version is not None:
-                 result = dataclasses.replace(result, version=version)
-
-         logger.info(f"[get_metadata|out] => ({result})")
-         return result
-
-     def _get_delta_log(self, path: str) -> DataFrame:
-         logger.info(f"[_get_delta_log|in] ({path})")
-
-         spark = UtilsSpark.get_spark_session()
-         jdf = (
-             spark._jvm.org.apache.spark.sql.delta.DeltaLog.forTable(spark._jsparkSession, path)
-             .snapshot()
-             .allFiles()
-             .toDF()
-         )
-         result = DataFrame(jdf, spark)
-
-         logger.info(f"[_get_delta_log|out] => {result}")
-         return result
-
-     def _get_table_partitions(self, path: str) -> List[str]:
-         logger.info(f"[_get_table_partitions|in] ({path})")
-         result: List[str] = []
-
-         delta_log: DataFrame = self._get_delta_log(path=path)
-         partition_keys = [
-             x.keys
-             for x in delta_log.select(F.map_keys(F.col("partitionValues")).alias("keys")).distinct().collect()
-             if 0 < len(x)
-         ]
-         if 0 < len(partition_keys):
-             result: List[str] = list({y for y in partition_keys for y in y})
-
-         logger.info(f"[_get_table_partitions|out] => {result}")
-         return result
-
-     def _vacuum_now(self, path: str):
-         logger.info("[_vacuum_now|in]")
-
-         spark = UtilsSpark.get_spark_session()
-         old_conf_value = spark.conf.get("spark.databricks.delta.retentionDurationCheck.enabled")
-         spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
-         DeltaTable.forPath(spark, path).vacuum(0)
-         spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", old_conf_value)
-
-         logger.info("[_vacuum_now|out]")
-
-     def _has_schema_changed(self, path: str, df: DataFrame) -> bool:
-         logger.info(f"[_has_schema_changed|in] ({path},{df})")
-         result: bool = False
-         table = self._get_table(path=path)
-         if table is not None:
-             result = table.toDF().schema != df.schema
-         logger.info(f"[_has_schema_changed|out] => {result}")
-         return result
-
-     def _set_table_partitions(self, path: str, partition_fields: List[str]) -> None:
-         logger.info(f"[_set_table_partitions|in] ({path},{partition_fields})")
-
-         spark = UtilsSpark.get_spark_session()
-         # let's check partition_cols
-         current_partition_fields = self._get_table_partitions(path=path)
-         shall_we_repartition = sorted(partition_fields) != sorted(current_partition_fields)
-
-         if shall_we_repartition:
-             logger.info("[_set_table_partitions] going to repartition")
-             new_df = spark.read.format("delta").load(path)
-             new_df.write.format("delta").mode("overwrite").partitionBy(*partition_fields).option(
-                 "overwriteSchema", "true"
-             ).save(path)
-             self._vacuum_now(path)
-             logger.info(
-                 f"[_set_table_partitions] changed partition cols from {current_partition_fields} to {partition_fields}"
-             )
-         logger.info("[_set_table_partitions|out]")
-
-     def _get_table(self, path) -> Optional[DeltaTable]:
-         logger.debug(f"[_get_table|in] ({path})")
-         result: DeltaTable = None
-         try:
-             result: DeltaTable = DeltaTable.forPath(UtilsSpark.get_spark_session(), path)
-         except AnalysisException as ax:
-             logger.warning(f"[_get_table] couldn't load from {path}: {ax}")
-
-         logger.debug(f"[_get_table|out] => {result}")
-         return result
-
-     def set_column_comments(self, db: str, table: str, col_comments: Dict[str, str]) -> None:
-         logger.info(f"[set_column_comments|in] ({db}, {table}, {col_comments})")
-         spark = UtilsSpark.get_spark_session()
-
-         table_description: DataFrame = spark.sql(f"describe {db}.{table}").withColumn(
-             "set_column_comments_id", monotonically_increasing_id()
-         )
-         id = table_description.filter(F.col("col_name") == "# Partitioning").collect()[0].set_column_comments_id
-         table_description = table_description.filter(
-             (F.col("set_column_comments_id") < F.lit(id)) & (F.col("col_name") != "")
-         ).drop("set_column_comments_id")
-         rows = [r.asDict() for r in table_description.collect()]
-         for row in rows:
-             col = row["col_name"]
-             data_type = row["data_type"]
-             if col in col_comments:
-                 new_comment = col_comments[col]
-                 logger.info(f"[set_column_comments] setting new comment ({new_comment}) to column {col}")
-                 spark.sql(f"ALTER TABLE {db}.{table} CHANGE COLUMN {col} {col} {data_type} COMMENT '{new_comment}'")
-
-         logger.info("[set_column_comments|out]")
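`SparkDeltaStore` is removed in 1.0.1 with no direct replacement in the new file list. For reference, a minimal sketch of how the removed class was exercised, inferred only from the signatures above (`get`, `save`, `update`); it assumes delta-spark is configured on the active session, that `UtilsSpark.get_spark_session()` picks that session up, and that the class is directly instantiable even though it also subclasses `ABC`:

    from pyspark.sql import SparkSession
    from tgedr.dataops.store.spark_delta import SparkDeltaStore  # 0.0.37 path, removed in 1.0.1

    spark = SparkSession.builder.getOrCreate()  # assumes a delta-enabled Spark session
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "day"])

    store = SparkDeltaStore()  # a thin concrete subclass may be needed if treated as abstract
    store.save(df=df, key="/tmp/delta/example", partition_fields=["day"])
    latest = store.get(key="/tmp/delta/example")  # optionally version="0" for time travel
    store.update(df=df, key="/tmp/delta/example", match_fields=["id"])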
tgedr/dataops/store/store.py (removed)
@@ -1,49 +0,0 @@
- import abc
- from typing import Any, Dict, Optional
-
-
- class StoreException(Exception):
-     pass
-
-
- class NoStoreException(StoreException):
-     pass
-
-
- class StoreInterface(metaclass=abc.ABCMeta):
-     @classmethod
-     def __subclasshook__(cls, subclass):
-         return (
-             hasattr(subclass, "get")
-             and callable(subclass.get)
-             and hasattr(subclass, "delete")
-             and callable(subclass.delete)
-             and hasattr(subclass, "save")
-             and callable(subclass.save)
-             and hasattr(subclass, "update")
-             and callable(subclass.update)
-         ) or NotImplemented
-
-
- @StoreInterface.register
- class Store(abc.ABC):
-     """abstract class used to manage persistence, defining CRUD-like (CreateReadUpdateDelete) methods"""
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         self._config = config
-
-     @abc.abstractmethod
-     def get(self, key: str, **kwargs) -> Any:
-         raise NotImplementedError()
-
-     @abc.abstractmethod
-     def delete(self, key: str, **kwargs) -> None:
-         raise NotImplementedError()
-
-     @abc.abstractmethod
-     def save(self, df: Any, key: str, **kwargs):
-         raise NotImplementedError()
-
-     @abc.abstractmethod
-     def update(self, df: Any, key: str, **kwargs):
-         raise NotImplementedError()
1
- import importlib
2
- import inspect
3
- import logging
4
- import os
5
- import sys
6
- from importlib import import_module
7
- from typing import Dict, Any, List
8
-
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- class UtilsReflectionException(Exception):
14
- pass
15
-
16
-
17
- class UtilsReflection:
18
- __MODULE_EXTENSIONS = (".py", ".pyc", ".pyo")
19
-
20
- @staticmethod
21
- def load_class(clazz: str, parent_check: type = None) -> Any:
22
- logger.debug(f"[load_class|in] (clazz={clazz}, parent_check={parent_check})")
23
- type_elements = clazz.split(".")
24
- module = ".".join(type_elements[:-1])
25
- _clazz = type_elements[-1]
26
-
27
- result = getattr(import_module(module), _clazz)
28
-
29
- if not callable(result):
30
- raise TypeError(f"Object {_clazz} in {module} is not callable.")
31
-
32
- if parent_check and (not issubclass(result, parent_check)):
33
- raise TypeError(f"Wrong class type, it is not a subclass of {parent_check.__name__}")
34
-
35
- logger.debug(f"[load_class|out] => {result}")
36
- return result
37
-
38
- @staticmethod
39
- def load_subclass_from_module(module: str, clazz: str, super_clazz: type) -> Any:
40
- logger.info(f"[load_subclass_from_module|in] (module={module}, clazz={clazz}, super_clazz={super_clazz})")
41
- result = getattr(import_module(module), clazz)
42
-
43
- if not callable(result):
44
- raise TypeError(f"Object {clazz} in {module} is not callable.")
45
-
46
- if super_clazz and (not issubclass(result, super_clazz)):
47
- raise TypeError(f"Wrong class type, it is not a subclass of {super_clazz.__name__}")
48
-
49
- logger.info(f"[load_subclass_from_module|out] => {result}")
50
- return result
51
-
52
- @staticmethod
53
- def get_type(module: str, _type: str) -> type:
54
- logger.info(f"[get_type|in] (module={module}, _type={_type})")
55
- result = None
56
-
57
- result = getattr(import_module(module), _type)
58
-
59
- logger.info(f"[get_type|out] => {result}")
60
- return result
61
-
62
- @staticmethod
63
- def is_subclass_of(sub_class: type, super_class: type) -> bool:
64
- logger.info(f"[is_subclass_of|in] ({sub_class}, {super_class})")
65
- result = False
66
-
67
- if callable(sub_class) and issubclass(sub_class, super_class):
68
- result = True
69
-
70
- logger.info(f"[is_subclass_of|out] => {result}")
71
- return result
72
-
73
- @staticmethod
74
- def find_module_classes(module: str) -> List[Any]:
75
- logger.info(f"[find_module_classes|in] ({module})")
76
- result = []
77
- for name, obj in inspect.getmembers(sys.modules[module]):
78
- if inspect.isclass(obj):
79
- result.append(obj)
80
- logger.info(f"[find_module_classes|out] => {result}")
81
- return result
82
-
83
- @staticmethod
84
- def find_class_implementations_in_package(package_name: str, super_class: type) -> Dict[str, type]:
85
- logger.info(f"[find_class_implementations_in_package|in] ({package_name}, {super_class})")
86
- result = {}
87
-
88
- the_package = importlib.import_module(package_name)
89
- pkg_path = the_package.__path__[0]
90
- modules = [
91
- package_name + "." + module.split(".")[0]
92
- for module in os.listdir(pkg_path)
93
- if module.endswith(UtilsReflection.__MODULE_EXTENSIONS) and module != "__init__.py"
94
- ]
95
-
96
- logger.info(f"[find_class_implementations_in_package] found modules: {modules}")
97
-
98
- for _module in modules:
99
- if _module not in sys.modules:
100
- importlib.import_module(_module)
101
-
102
- for _class in UtilsReflection.find_module_classes(_module):
103
- if UtilsReflection.is_subclass_of(_class, super_class) and _class != super_class:
104
- result[_module] = _class
105
-
106
- logger.info(f"[find_class_implementations_in_package|out] => {result}")
107
- return result
108
-
109
- @staticmethod
110
- def find_package_path(package_name: str) -> str:
111
- logger.info(f"[find_package_path|in] ({package_name})")
112
- the_package = importlib.import_module(package_name)
113
- result = the_package.__path__[0]
114
- logger.info(f"[find_package_path|out] => {result}")
115
- return result
116
-
117
- @staticmethod
118
- def find_class_implementations(packages: str, clazz: Any) -> Dict[str, Any]:
119
- """
120
- throws UtilsReflectionException
121
- """
122
- logger.info(f"[find_class_implementations|in] ({packages}, {clazz})")
123
- result = {}
124
- _packages = [a.strip() for a in packages.split(",")]
125
-
126
- # find classes that extend clazz
127
- for pack_name in _packages:
128
- module_class_map = UtilsReflection.find_class_implementations_in_package(pack_name, clazz)
129
- for mod, _clazz in module_class_map.items():
130
- impl = mod.split(".")[-1]
131
- result[impl] = _clazz
132
-
133
- logger.info(f"[find_class_implementations|out] => {result}")
134
- return result
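`UtilsReflection` is also removed in 1.0.1 with no counterpart in the new file list; it was a thin wrapper over `importlib`/`inspect` for loading classes by dotted path and discovering implementations inside a package. A small usage sketch against the removed 0.0.37 API shown above, using a standard-library class purely for illustration:

    from tgedr.dataops.utils_reflection import UtilsReflection  # 0.0.37 path, removed in 1.0.1

    # load a class from a fully qualified name, optionally enforcing a base class
    cls = UtilsReflection.load_class("collections.OrderedDict", parent_check=dict)
    instance = cls(a=1)

    # discover all subclasses of a given base type across comma-separated packages
    # (returns one entry per module, keyed by module basename), e.g.:
    # impls = UtilsReflection.find_class_implementations("tgedr.dataops.source", Source)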