cloe-nessy 0.3.3__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +0 -0
- cloe_nessy/clients/api_client/__init__.py +0 -0
- cloe_nessy/clients/api_client/api_client.py +0 -0
- cloe_nessy/clients/api_client/api_response.py +0 -0
- cloe_nessy/clients/api_client/auth.py +0 -0
- cloe_nessy/clients/api_client/exceptions.py +0 -0
- cloe_nessy/file_utilities/__init__.py +0 -0
- cloe_nessy/file_utilities/exceptions.py +0 -0
- cloe_nessy/file_utilities/factory.py +0 -0
- cloe_nessy/file_utilities/get_file_paths.py +0 -0
- cloe_nessy/file_utilities/location_types.py +0 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +0 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +0 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +0 -0
- cloe_nessy/integration/reader/api_reader.py +0 -0
- cloe_nessy/integration/reader/catalog_reader.py +0 -0
- cloe_nessy/integration/reader/excel_reader.py +0 -0
- cloe_nessy/integration/reader/exceptions.py +0 -0
- cloe_nessy/integration/reader/file_reader.py +7 -1
- cloe_nessy/integration/reader/reader.py +0 -0
- cloe_nessy/integration/writer/__init__.py +0 -0
- cloe_nessy/integration/writer/catalog_writer.py +1 -1
- cloe_nessy/logging/__init__.py +0 -0
- cloe_nessy/logging/logger_mixin.py +0 -0
- cloe_nessy/models/__init__.py +4 -0
- cloe_nessy/models/adapter/__init__.py +3 -0
- cloe_nessy/models/adapter/unity_catalog_adapter.py +292 -0
- cloe_nessy/models/catalog.py +10 -0
- cloe_nessy/models/column.py +0 -0
- cloe_nessy/models/constraint.py +0 -0
- cloe_nessy/models/foreign_key.py +0 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +0 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +0 -0
- cloe_nessy/models/schema.py +19 -0
- cloe_nessy/models/table.py +50 -5
- cloe_nessy/models/types.py +0 -0
- cloe_nessy/models/volume.py +67 -0
- cloe_nessy/object_manager/__init__.py +7 -2
- cloe_nessy/object_manager/table_manager.py +183 -7
- cloe_nessy/object_manager/volume_manager.py +70 -0
- cloe_nessy/pipeline/__init__.py +0 -0
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/read_api.py +69 -45
- cloe_nessy/pipeline/actions/read_catalog_table.py +9 -9
- cloe_nessy/pipeline/actions/read_excel.py +14 -10
- cloe_nessy/pipeline/actions/read_files.py +54 -28
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +9 -9
- cloe_nessy/pipeline/actions/transform_change_datatype.py +13 -8
- cloe_nessy/pipeline/actions/transform_clean_column_names.py +4 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +25 -11
- cloe_nessy/pipeline/actions/transform_decode.py +18 -7
- cloe_nessy/pipeline/actions/transform_deduplication.py +9 -9
- cloe_nessy/pipeline/actions/transform_distinct.py +8 -8
- cloe_nessy/pipeline/actions/transform_filter.py +6 -6
- cloe_nessy/pipeline/actions/transform_generic_sql.py +12 -6
- cloe_nessy/pipeline/actions/transform_group_aggregate.py +20 -26
- cloe_nessy/pipeline/actions/transform_hash_columns.py +209 -0
- cloe_nessy/pipeline/actions/transform_join.py +17 -10
- cloe_nessy/pipeline/actions/transform_json_normalize.py +19 -6
- cloe_nessy/pipeline/actions/transform_rename_columns.py +7 -7
- cloe_nessy/pipeline/actions/transform_replace_values.py +8 -8
- cloe_nessy/pipeline/actions/transform_select_columns.py +38 -9
- cloe_nessy/pipeline/actions/transform_union.py +12 -8
- cloe_nessy/pipeline/actions/write_catalog_table.py +11 -10
- cloe_nessy/pipeline/pipeline.py +44 -2
- cloe_nessy/pipeline/pipeline_action.py +0 -0
- cloe_nessy/pipeline/pipeline_config.py +0 -0
- cloe_nessy/pipeline/pipeline_context.py +0 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +0 -0
- cloe_nessy/pipeline/pipeline_step.py +0 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +0 -0
- cloe_nessy/session/session_manager.py +27 -0
- cloe_nessy/settings/__init__.py +0 -0
- cloe_nessy/settings/settings.py +0 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +0 -0
- cloe_nessy-0.3.8.dist-info/METADATA +46 -0
- {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/RECORD +41 -35
- {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.3.dist-info → cloe_nessy-0.3.8.dist-info}/top_level.txt +0 -0
- cloe_nessy-0.3.3.dist-info/METADATA +0 -26
cloe_nessy/object_manager/table_manager.py CHANGED

@@ -1,22 +1,119 @@
+import functools
+import logging
+from dataclasses import dataclass, field
+
+from delta import DeltaTable  # type: ignore
+
 from ..logging import LoggerMixin
+from ..models import Table
 from ..session import SessionManager


+@dataclass
+class TableManagerLogs:
+    """Dataclass defining the table manager logs table."""
+
+    logger_name = "Tabular:TableManager"
+    log_type: str = "nessy_simple_logs"
+    uc_table_name: str = "nessy_simple_logs"
+    uc_table_columns: dict[str, str] = field(
+        default_factory=lambda: {
+            "message": "STRING",
+        }
+    )
+
+
+def table_log_decorator(operation: str):
+    """Creates a decorator that logs the start, failure (if any), and completion of a table operation.
+
+    The created decorator wraps a function that performs an operation on a table. The decorator logs
+    the start of the operation, calls the original function, logs if there was an exception, and logs
+    the completion of the operation. Functions that are wrapped must support the self._table_logger
+    attribute.
+
+    Args:
+        operation: The name of the operation to be logged. This will be included in the log messages.
+
+    Returns:
+        inner_decorator: A decorator that can be used to wrap a function that performs an operation on a table.
+
+    Example:
+        ```python
+        @table_log_decorator(operation='delete_physical_data_for_table')
+        def _delete_physical_data(self, table_identifier: str):
+            self._dbutils.fs.rm(table_location, recurse=True)
+        ```
+    """
+
+    def inner_decorator(func):
+        @functools.wraps(func)
+        def wrapper(self, *args, **kwargs):
+            table_identifier = kwargs.get("table_identifier") or kwargs.get("table").identifier or args[0]
+            if not isinstance(table_identifier, str):
+                # assume its a Table object
+                table_identifier = table_identifier.identifier
+            self._tabular_logger.info(
+                "operation:%s | identifier:%s | status:start | error:''",
+                operation,
+                table_identifier,
+            )
+            try:
+                func(self, *args, **kwargs)
+            except Exception as e:
+                self._tabular_logger.error(
+                    "operation:%s | identifier:%s | status:failed | error:%s",
+                    operation,
+                    table_identifier,
+                    e,
+                )
+                raise e
+            else:
+                self._tabular_logger.info(
+                    "operation:%s | identifier:%s | status:completed | error:''",
+                    operation,
+                    table_identifier,
+                )
+
+        return wrapper
+
+    return inner_decorator
+
+
 class TableManager(LoggerMixin):
-    """TableManager class for managing tables
+    """TableManager class for managing tables."""

-    def __init__(self):
+    def __init__(self, tabular_logger: logging.Logger | None = None):
         self._spark = SessionManager.get_spark_session()
         self._utils = SessionManager.get_utils()
         self._console_logger = self.get_console_logger()
         self._console_logger.debug("TableManager initialized...")
-        self._tabular_logger = self.get_tabular_logger(
+        self._tabular_logger = tabular_logger or self.get_tabular_logger(**TableManagerLogs().__dict__)
         self._tabular_logger.debug("message:TableManager initialized.")

-    @
-    def create_table(
-
-
+    @table_log_decorator(operation="create")
+    def create_table(
+        self,
+        table: Table,
+        ignore_if_exists: bool = False,
+        replace: bool = False,
+    ) -> None:
+        """Creates a Table in the catalog.
+
+        Args:
+            table: A Table object representing the Delta table.
+            ignore_if_exists: If set to True, the function will return early
+                without doing anything if the table already exists.
+            replace: If set to True, the function will replace the table if it
+                already exists.
+        """
+        if ignore_if_exists and self.table_exists(table):
+            return
+        self._console_logger.info(f"Creating table: {table.identifier}")
+        self._spark.sql(f"USE CATALOG {table.catalog};")
+        self._spark.sql(f"USE SCHEMA {table.schema};")
+        for statement in table.get_create_statement(replace=replace).split(";"):
+            if statement and statement != "\n":
+                self._spark.sql(statement)

     def drop_table(self, table_identifier: str, delete_physical_data: bool = False):
         """Deletes a Table. For security reasons you are forced to pass the table_name.
@@ -56,3 +153,82 @@ class TableManager(LoggerMixin):
         """
         self._console_logger.info("... deleting physical data for table [ '' ] from Catalog.")
         raise NotImplementedError("This can be implemented, once a Table object is available.")
+
+    def get_delta_table(self, table: Table | None = None, location: str | None = None) -> DeltaTable:
+        """Get the DeltaTable object from the Table objects location or a location string.
+
+        Args:
+            table: A Table object representing the Delta table.
+            location: A string representing the table location.
+
+        Returns:
+            The DeltaTable object corresponding to the given Table object or location string.
+
+        Raises:
+            ValueError: If neither table nor location is provided, or if both are provided.
+        """
+        if (table is None and location is None) or (table is not None and location is not None):
+            raise ValueError("Either table or location must be provided, but not both.")
+
+        if table is not None:
+            location = str(table.storage_path)
+        self._console_logger.info(f"Getting DeltaTable object for location: {location}")
+        return DeltaTable.forPath(self._spark, str(location))
+
+    def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
+        """Checks if a table exists in the catalog.
+
+        Args:
+            table: A Table object representing the Delta table.
+            table_identifier: A string representing the table identifier in the format 'catalog.schema.table'.
+
+        Returns:
+            True if the table exists, else False.
+
+        Raises:
+            ValueError: If neither table nor table_identifier is provided, or if both are provided.
+            ValueError: If the table_identifier is not in the format 'catalog.schema.table'.
+        """
+        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
+            raise ValueError("Either table or table_identifier must be provided, but not both.")
+
+        if table is not None:
+            catalog = table.catalog
+            schema = table.schema
+            table_name = table.name
+        else:
+            assert table_identifier is not None, "table_identifier must be provided."
+            catalog, schema, table_name = table_identifier.split(".")
+            if not all([catalog, schema, table_name]):
+                raise ValueError("Invalid table identifier format. Expected 'catalog.schema.table'.")
+
+        query_result = self._spark.sql(
+            f"""
+            SELECT 1 FROM {catalog}.information_schema.tables
+            WHERE table_name = '{table_name}'
+            AND table_schema = '{schema}'
+            LIMIT 1""",
+        )
+        result = query_result.count() > 0
+        self._console_logger.info(f"Table [ '{catalog}.{schema}.{table_name}' ] exists: {result}")
+        return result is True
+
+    @table_log_decorator(operation="refresh")
+    def refresh_table(self, table: Table | None = None, table_identifier: str | None = None):
+        """Refreshes the metadata of a Delta table.
+
+        Args:
+            table: A Table object representing the Delta table.
+            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
+
+        Raises:
+            ValueError: If neither table nor table_identifier is provided, or if both are provided.
+        """
+        if (table is None and table_identifier is None) or (table is not None and table_identifier is not None):
+            raise ValueError("Either table or table_identifier must be provided, but not both.")
+
+        if table is not None:
+            table_identifier = f"{table.catalog}.{table.schema}.{table.name}"
+
+        self._console_logger.info(f"Refreshing table: {table_identifier}")
+        self._spark.sql(f"REFRESH TABLE {table_identifier};")
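To make the logging contract of the new `table_log_decorator` concrete, here is a minimal, self-contained sketch of the same start/failed/completed pattern. `DummyManager` and its stub logger are hypothetical stand-ins for illustration only and are not part of the package.

```python
import functools
import logging

logging.basicConfig(level=logging.INFO, format="%(message)s")


def table_log_decorator(operation: str):
    """Log start/failed/completed around a table operation, mirroring the decorator added above."""

    def inner_decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            # Resolve the identifier the same way as above: keyword argument first, then the first positional one.
            table_identifier = kwargs.get("table_identifier") or args[0]
            self._tabular_logger.info(
                "operation:%s | identifier:%s | status:start | error:''", operation, table_identifier
            )
            try:
                func(self, *args, **kwargs)
            except Exception as e:
                self._tabular_logger.error(
                    "operation:%s | identifier:%s | status:failed | error:%s", operation, table_identifier, e
                )
                raise
            else:
                self._tabular_logger.info(
                    "operation:%s | identifier:%s | status:completed | error:''", operation, table_identifier
                )

        return wrapper

    return inner_decorator


class DummyManager:
    """Hypothetical stand-in for TableManager: it only provides the _tabular_logger the decorator expects."""

    def __init__(self):
        self._tabular_logger = logging.getLogger("Tabular:TableManager")

    @table_log_decorator(operation="refresh")
    def refresh_table(self, table_identifier: str):
        print(f"pretending to refresh {table_identifier}")


DummyManager().refresh_table("my_catalog.my_schema.my_table")
# operation:refresh | identifier:my_catalog.my_schema.my_table | status:start | error:''
# pretending to refresh my_catalog.my_schema.my_table
# operation:refresh | identifier:my_catalog.my_schema.my_table | status:completed | error:''
```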
cloe_nessy/object_manager/volume_manager.py CHANGED

@@ -0,0 +1,70 @@
+import logging
+
+from ..logging import LoggerMixin
+from ..models import Volume
+from ..session import SessionManager
+
+
+class VolumeManager(LoggerMixin):
+    """VolumeManager class for managing volumes."""
+
+    def __init__(self, console_logger: logging.Logger | None = None):
+        self._spark = SessionManager.get_spark_session()
+        self._console_logger = console_logger or self.get_console_logger()
+
+    def create_volume(self, volume: Volume):
+        """Creates a Volume in the catalog.
+
+        Args:
+            volume: A Volume object representing the UC object.
+        """
+        self._console_logger.info(f"Creating volume: {volume.identifier}")
+        self._spark.sql(f"USE CATALOG {volume.catalog};")
+        self._spark.sql(f"USE SCHEMA {volume.schema_name};")
+        for statement in volume.get_create_statement().split(";"):
+            if statement and statement != "\n":
+                self._spark.sql(statement)
+
+    def drop_volume(self, volume: Volume, if_exists: bool = True):
+        """Delete the volume.
+
+        Args:
+            volume: The volume to be deleted.
+            if_exists: If False, an error will be raised if the volume does not exist.
+        """
+        self._console_logger.info(f"Deleting volume: [' {volume.identifier}' ]")
+        self._spark.sql(f"DROP VOLUME {'IF EXISTS' if if_exists else ''} {volume.escaped_identifier};")
+        self._console_logger.info(f"Volume [' {volume.identifier}' ] has been deleted.")
+
+    def volume_exists(self, volume: Volume | None = None, volume_identifier: str | None = None) -> bool:
+        """Check if the volume exists.
+
+        Args:
+            volume: The volume to check.
+            volume_identifier: The identifier of the volume to check.
+
+        Raises:
+            ValueError: If both volume and volume_identifier are provided.
+
+        Returns:
+            True if the volume exists, False otherwise.
+        """
+        if volume and volume_identifier:
+            raise ValueError("Only one of volume or volume_identifier should be provided.")
+        if volume:
+            volume_identifier = volume.identifier
+
+        assert volume_identifier is not None
+
+        if volume_identifier.count(".") != 2:
+            raise ValueError("The identifier must be in the format 'catalog.schema.volume_name'.")
+        catalog, volume_schema, table_name = volume_identifier.split(".")
+        query_result = self._spark.sql(
+            f"""
+            SELECT 1 FROM {catalog}.information_schema.volumes
+            WHERE volume_name = '{table_name}'
+            AND volume_schema = '{volume_schema}'
+            LIMIT 1""",
+        )
+        result = query_result.count() > 0
+        return result is True
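Both managers use the same existence-check pattern: split a three-part identifier and probe the catalog's `information_schema`. Below is a small sketch of just that pure-Python part; the `spark.sql` call is left out so the snippet runs without a Spark session, and the identifier used is a made-up example.

```python
def build_volume_exists_query(volume_identifier: str) -> str:
    """Build the information_schema probe used by volume_exists (query text only, no Spark call)."""
    if volume_identifier.count(".") != 2:
        raise ValueError("The identifier must be in the format 'catalog.schema.volume_name'.")
    catalog, volume_schema, volume_name = volume_identifier.split(".")
    return (
        f"SELECT 1 FROM {catalog}.information_schema.volumes "
        f"WHERE volume_name = '{volume_name}' "
        f"AND volume_schema = '{volume_schema}' "
        f"LIMIT 1"
    )


print(build_volume_exists_query("my_catalog.raw.landing_volume"))
# SELECT 1 FROM my_catalog.information_schema.volumes WHERE volume_name = 'landing_volume' AND volume_schema = 'raw' LIMIT 1
```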
cloe_nessy/pipeline/__init__.py CHANGED

File without changes
cloe_nessy/pipeline/actions/__init__.py CHANGED

@@ -14,6 +14,7 @@ from .transform_distinct import TransformDistinctAction
 from .transform_filter import TransformFilterAction
 from .transform_generic_sql import TransformSqlAction
 from .transform_group_aggregate import TransformGroupAggregate
+from .transform_hash_columns import TransformHashColumnsAction
 from .transform_join import TransformJoinAction
 from .transform_json_normalize import TransformJsonNormalize
 from .transform_rename_columns import TransformRenameColumnsAction
@@ -51,4 +52,5 @@ __all__ = [
     "TransformRenameColumnsAction",
     "TransformReplaceValuesAction",
     "TransformSelectColumnsAction",
+    "TransformHashColumnsAction",
 ]
cloe_nessy/pipeline/actions/read_api.py CHANGED

@@ -55,51 +55,75 @@ class ReadAPIAction(PipelineAction):
            DataFrame containing the response data.

        Example:
-[45 removed lines not captured in this view]
+            === "Basic Usage"
+                ```yaml
+                Read API:
+                  action: READ_API
+                  options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+                ```
+            === "Usage with Parameters and Headers"
+                ```yaml
+                Read API:
+                  action: READ_API
+                  options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+                    method: GET
+                    timeout: 90
+                    headers:
+                      key1: value1
+                      key2: value2
+                    params:
+                      key1: value1
+                      key2: value2
+                ```
+            === "Usage with Authentication"
+                ```yaml
+                Read API:
+                  action: READ_API
+                  options:
+                    base_url: https://some_url.com/api/
+                    endpoint: my/endpoint/
+                    method: GET
+                    timeout: 90
+                    auth:
+                      - type: basic
+                        username: my_username
+                        password: my_password
+                      - type: secret_scope
+                        secret_scope: my_secret_scope
+                        header_template:
+                          "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
+                      - type: secret_scope
+                        secret_scope: my_secret_scope
+                        header_template:
+                          "header_key_2": "<SECRET_NAME>"
+                      - type: secret_scope
+                        secret_scope: my_other_secret_scope
+                        header_template:
+                          "header_key_3": "<SECRET_NAME>"
+                      - type: azure_oauth
+                        client_id: my_client_id
+                        client_secret: my_client_secret
+                        tenant_id: my_tenant_id
+                        scope: <entra-id-client-id>
+                ```
+
+            The above example will combine the headers from the different auth types. The resulting header will look like this:
+
+            ```json
+            {
+                "header_key_1": "value_from_environment_variable",
+                "header_key_2": "value_from_secret",
+                "header_key_3": "value_from_secret",
+                "Authorization": "Bearer <access_token> (from azure_oauth)",
+                "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
+            }
+            ```
+
+        !!! warning "Secret information"
            Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
            Use secret scopes or environment variables instead.
        """
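As a rough illustration of how the combined header in the JSON example above can come about, the sketch below merges header dictionaries from two sources into one request header. The helper functions and the stubbed secret store are assumptions for illustration, not the package's actual auth API.

```python
import base64


def basic_auth_header(username: str, password: str) -> dict[str, str]:
    """Build an HTTP Basic Authorization header."""
    token = base64.b64encode(f"{username}:{password}".encode()).decode()
    return {"Authorization": f"Basic {token}"}


def secret_scope_header(header_template: dict[str, str], secrets: dict[str, str]) -> dict[str, str]:
    """Resolve '<SECRET_NAME>' placeholders in a header template against a secret store (stubbed as a dict)."""
    return {key: secrets.get(value.strip("<>"), value) for key, value in header_template.items()}


headers: dict[str, str] = {}
headers.update(secret_scope_header({"header_key_1": "<MY_SECRET>"}, {"MY_SECRET": "value_from_secret"}))
headers.update(basic_auth_header("john@example.com", "abc123"))
print(headers)
# {'header_key_1': 'value_from_secret', 'Authorization': 'Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM='}
```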
cloe_nessy/pipeline/actions/read_catalog_table.py CHANGED

@@ -15,13 +15,13 @@ class ReadCatalogTableAction(PipelineAction):
    into a DataFrame and returned as part of an updated `PipelineContext`.

    Example:
-[7 removed lines not captured in this view]
+        ```yaml
+        Read Sales Table:
+          action: READ_CATALOG_TABLE
+          options:
+            table_identifier: my_catalog.business_schema.sales_table
+            options: <options for the CatalogReader read method>
+        ```
    """

    name: str = "READ_CATALOG_TABLE"
@@ -43,8 +43,8 @@ class ReadCatalogTableAction(PipelineAction):
                read. If not provided, the function will attempt to use the table
                identifier from the `table_metadata` in the `context`.
            options: A dictionary of options for customizing
-                the
-                to None.
+                the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
+                behavior, such as filters or reading modes. Defaults to None.

        Raises:
            ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
cloe_nessy/pipeline/actions/read_excel.py CHANGED

@@ -21,16 +21,20 @@ class ReadExcelAction(PipelineAction):
    the read files can be included in the context.

    Example:
-[10 removed lines not captured in this view]
+        ```yaml
+        Read Excel Table:
+          action: READ_EXCEL
+          options:
+            file: excel_file_folder/excel_files_june/interesting_excel_file.xlsx
+            usecols:
+              - key_column
+              - interesting_column
+            options: <options for the ExcelDataFrameReader read method>
+        ```
+
+    !!! note "More Options"
+        The `READ_EXCEL` action supports additional options that can be passed to the
+        run method. For more information, refer to the method documentation.
    """

    name: str = "READ_EXCEL"
cloe_nessy/pipeline/actions/read_files.py CHANGED

@@ -14,14 +14,47 @@ class ReadFilesAction(PipelineAction):
    location will be read using a DataFrameReader with the specified format.

    Example:
-[8 removed lines not captured in this view]
+        === "Read files specified by spark_format"
+            ```yaml
+            Read Files:
+              action: READ_FILES
+              options:
+                location: json_file_folder/
+                search_subdirs: True
+                spark_format: JSON
+            ```
+            !!! note "Define Spark Format"
+                Use the `spark_format` option to specify the format with which
+                to read the files. Supported formats are e.g., `CSV`, `JSON`,
+                `PARQUET`, `TEXT`, and `XML`.
+
+        === "Read files specified by extension"
+            ```yaml
+            Read Files:
+              action: READ_FILES
+              options:
+                location: csv_file_folder/
+                search_subdirs: True
+                extension: csv
+            ```
+            !!! note "Define Extension"
+                Use the `extension` option to specify the extension of the files
+                to read. If not specified, the `spark_format` will be derived from
+                the extension.
+
+        === "Read files with a specified spark_format AND extension"
+            ```yaml
+            Read Files:
+              action: READ_FILES
+              options:
+                location: file_folder/
+                extension: abc_custom_extension # specifies the files to read
+                spark_format: CSV # specifies the format to read the files with
+            ```
+            !!! note "Define both Extension & Spark Format"
+                Use the `extension` option to specify the extension of the files
+                to read. Additionally, use the `spark_format` option to specify
+                the format with which to read the files.
    """

    name: str = "READ_FILES"
@@ -47,7 +80,8 @@ class ReadFilesAction(PipelineAction):
            search_subdirs: Recursively search subdirectories for files
                if an extension is provided.
            extension: The file extension to filter files by.
-            spark_format: The format to use for reading the files.
+            spark_format: The format to use for reading the files. If not provided,
+                it will be deferred from the file extension.
            schema: The schema of the data. If None, schema is obtained from
                the context metadata.
            add_metadata_column: Whether to include the `__metadata` column with
@@ -65,30 +99,22 @@ class ReadFilesAction(PipelineAction):
            raise ValueError("No location provided. Please specify location to read files from.")
        if not options:
            options = dict()
+        if not spark_format and not extension:
+            raise ValueError("Either spark_format or extension must be provided.")

        if (metadata := context.table_metadata) and schema is None:
            schema = metadata.schema

        file_reader = FileReader()
-[9 removed lines not captured in this view]
-        elif spark_format:
-            df = file_reader.read(
-                location=location,
-                schema=schema,
-                spark_format=spark_format,
-                options=options,
-                add_metadata_column=add_metadata_column,
-            )
-        else:
-            raise ValueError("Please provide either the 'extension' or 'spark_format'")
+        df = file_reader.read(
+            location=location,
+            schema=schema,
+            extension=extension,
+            spark_format=spark_format,
+            search_subdirs=search_subdirs,
+            options=options,
+            add_metadata_column=add_metadata_column,
+        )

        runtime_info = context.runtime_info

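A hedged sketch of the "either spark_format or extension" rule shown above: when only an extension is given, a reader format has to be derived from it. The mapping below is an assumption for illustration; the package's FileReader may resolve formats differently.

```python
# Hypothetical extension-to-format mapping; the real FileReader may differ.
EXTENSION_TO_FORMAT = {"csv": "csv", "json": "json", "parquet": "parquet", "txt": "text", "xml": "xml"}


def resolve_spark_format(spark_format: str | None, extension: str | None) -> str:
    """Return the Spark reader format, deriving it from the extension when no format is given."""
    if not spark_format and not extension:
        raise ValueError("Either spark_format or extension must be provided.")
    if spark_format:
        return spark_format.lower()
    return EXTENSION_TO_FORMAT.get(extension.lower(), extension.lower())


print(resolve_spark_format(None, "csv"))                    # csv
print(resolve_spark_format("CSV", "abc_custom_extension"))  # csv  (explicit format wins)
```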
cloe_nessy/pipeline/actions/read_metadata_yaml.py CHANGED

@@ -10,14 +10,14 @@ class ReadMetadataYAMLAction(PipelineAction):
    """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

    Example:
-[8 removed lines not captured in this view]
+        ```yaml
+        Read Schema Metadata:
+          action: READ_METADATA_YAML_ACTION
+          options:
+            path: excel_file_folder/excel_files_june/
+            file_name: sales_schema.yml
+            table_name: sales
+        ```
    """

    name: str = "READ_METADATA_YAML_ACTION"
@@ -31,7 +31,7 @@ class ReadMetadataYAMLAction(PipelineAction):
        table_name: str | None = None,
        **_: Any,
    ) -> PipelineContext:
-        """Reads schema metadata from a yaml file using the `Schema` model.
+        """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

        Args:
            context: The context in which this Action is executed.
cloe_nessy/pipeline/actions/transform_change_datatype.py CHANGED

@@ -9,15 +9,20 @@ from ..pipeline_context import PipelineContext
class TransformChangeDatatypeAction(PipelineAction):
    """Changes the datatypes of specified columns in the given DataFrame.

+    !!! note "Data Types"
+        We make use of the PySpark `cast` function to change the data types of
+        the columns. Valid data types can be found in the [PySpark
+        documentation](https://spark.apache.org/docs/3.5.3/sql-ref-datatypes.html).
+
    Example:
-[8 removed lines not captured in this view]
+        ```yaml
+        Cast Columns:
+          action: TRANSFORM_CHANGE_DATATYPE
+          options:
+            columns:
+              id: string
+              revenue: long
+        ```
    """

    name: str = "TRANSFORM_CHANGE_DATATYPE"
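To illustrate the cast behaviour the note above refers to, here is a minimal PySpark sketch (it assumes a local PySpark installation; the action's own option handling is not reproduced here).

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("1", "1200")], ["id", "revenue"])

# Same column-to-type mapping as the YAML example above.
columns = {"id": "string", "revenue": "long"}
for column_name, data_type in columns.items():
    df = df.withColumn(column_name, F.col(column_name).cast(data_type))

df.printSchema()
# root
#  |-- id: string (nullable = true)
#  |-- revenue: long (nullable = true)
```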
cloe_nessy/pipeline/actions/transform_clean_column_names.py CHANGED

@@ -15,6 +15,10 @@ class TransformCleanColumnNamesAction(PipelineAction):
    Removes invalid characters from the column names, including the fields of a struct and
    replaces a single leading underscore by a double underscore.

+    Invalid characters include:
+    - Any non-word character (anything other than letters, digits, and underscores).
+    - A single leading underscore.
+
    Example:
        ```yaml
        Clean Column Names: