cloe-nessy 0.2.9 (cloe_nessy-0.2.9-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +5 -0
- cloe_nessy/clients/api_client/__init__.py +3 -0
- cloe_nessy/clients/api_client/api_client.py +188 -0
- cloe_nessy/clients/api_client/api_response.py +72 -0
- cloe_nessy/clients/api_client/auth.py +178 -0
- cloe_nessy/clients/api_client/exceptions.py +22 -0
- cloe_nessy/file_utilities/__init__.py +3 -0
- cloe_nessy/file_utilities/exceptions.py +4 -0
- cloe_nessy/file_utilities/factory.py +42 -0
- cloe_nessy/file_utilities/get_file_paths.py +72 -0
- cloe_nessy/file_utilities/location_types.py +29 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +6 -0
- cloe_nessy/integration/reader/api_reader.py +141 -0
- cloe_nessy/integration/reader/catalog_reader.py +49 -0
- cloe_nessy/integration/reader/excel_reader.py +170 -0
- cloe_nessy/integration/reader/exceptions.py +10 -0
- cloe_nessy/integration/reader/file_reader.py +96 -0
- cloe_nessy/integration/reader/reader.py +34 -0
- cloe_nessy/integration/writer/__init__.py +3 -0
- cloe_nessy/integration/writer/catalog_writer.py +48 -0
- cloe_nessy/logging/__init__.py +3 -0
- cloe_nessy/logging/logger_mixin.py +162 -0
- cloe_nessy/models/__init__.py +13 -0
- cloe_nessy/models/column.py +65 -0
- cloe_nessy/models/constraint.py +9 -0
- cloe_nessy/models/foreign_key.py +34 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
- cloe_nessy/models/schema.py +76 -0
- cloe_nessy/models/table.py +236 -0
- cloe_nessy/models/types.py +7 -0
- cloe_nessy/object_manager/__init__.py +3 -0
- cloe_nessy/object_manager/table_manager.py +58 -0
- cloe_nessy/pipeline/__init__.py +7 -0
- cloe_nessy/pipeline/actions/__init__.py +50 -0
- cloe_nessy/pipeline/actions/read_api.py +178 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
- cloe_nessy/pipeline/actions/read_excel.py +177 -0
- cloe_nessy/pipeline/actions/read_files.py +105 -0
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
- cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
- cloe_nessy/pipeline/actions/transform_decode.py +102 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
- cloe_nessy/pipeline/actions/transform_filter.py +51 -0
- cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
- cloe_nessy/pipeline/actions/transform_join.py +81 -0
- cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
- cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
- cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
- cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
- cloe_nessy/pipeline/actions/transform_union.py +71 -0
- cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
- cloe_nessy/pipeline/pipeline.py +201 -0
- cloe_nessy/pipeline/pipeline_action.py +62 -0
- cloe_nessy/pipeline/pipeline_config.py +92 -0
- cloe_nessy/pipeline/pipeline_context.py +56 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
- cloe_nessy/pipeline/pipeline_step.py +50 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +3 -0
- cloe_nessy/session/session_manager.py +188 -0
- cloe_nessy/settings/__init__.py +3 -0
- cloe_nessy/settings/settings.py +91 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +19 -0
- cloe_nessy-0.2.9.dist-info/METADATA +26 -0
- cloe_nessy-0.2.9.dist-info/RECORD +78 -0
- cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
- cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0

cloe_nessy/models/schema.py
@@ -0,0 +1,76 @@
from pathlib import Path
from typing import Any, Self

from pydantic import Field

from ..utils.file_and_directory_handler import process_path
from .mixins.read_instance_mixin import ReadInstancesMixin
from .table import Table
from .types import ValidationErrorType


class Schema(ReadInstancesMixin):
    """A Class to represent a Schema in Unity Catalog."""

    catalog: str
    name: str
    storage_path: str | None = None
    tables: list[Table] = Field(default_factory=list)
    properties: dict[str, Any] = Field(default_factory=dict)

    @classmethod
    def read_instance_from_file(
        cls,
        instance_path: str | Path,
        fail_on_missing_subfolder: bool = True,
        table_dir_name: str = "tables",
        **_: Any,
    ) -> tuple[Self | None, list[ValidationErrorType]]:
        """Read a schema from file.

        Adds the table objects from a path relative to the schema definition.

        Args:
            instance_path: The path to the Schema definition YAML file.
            fail_on_missing_subfolder: If False return a tuple with 2 empty
                lists. Otherwise raise a FileNotFoundError.
            table_dir_name: The name of the directory containing the Table
                definitions related to this schema. Can be a relative path.
        """
        processed_instance_path = process_path(instance_path)
        if not processed_instance_path:
            raise FileNotFoundError("Schema file not found.")

        schema, schema_errors = super().read_instance_from_file(processed_instance_path)
        if schema:
            schema.storage_path = "" if not schema.storage_path else schema.storage_path
            tables, table_errors = Table.read_instances_from_directory(
                instance_path=processed_instance_path.parents[0] / table_dir_name,
                catalog_name=schema.catalog,
                schema_name=schema.name,
                schema_storage_path=Path(schema.storage_path),
                fail_on_missing_subfolder=fail_on_missing_subfolder,
            )
            schema.tables = tables
        return schema, schema_errors + table_errors

    def get_table_by_name(self, table_name: str) -> Table:
        """Return table in schema.

        Filters tables in schema for table_name and returns Table object.

        Args:
            table_name: Name of table to return from schema.

        Raises:
            ValueError: If table not found in schema metadata.

        Returns:
            The table.
        """
        table = next((table for table in self.tables if table.name == table_name), None)

        if not table:
            raise ValueError(f"Table {table_name} not found in {self.catalog}.{self.name} metadata.")

        return table
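
For illustration only (not part of the wheel contents), a minimal sketch of how the Schema.read_instance_from_file and get_table_by_name methods shown above might be called; the file path and table name are placeholders.

    from cloe_nessy.models.schema import Schema

    # Read a schema definition; its tables are loaded from the "tables"
    # directory next to the YAML file (the default table_dir_name).
    schema, errors = Schema.read_instance_from_file("metadata/sales/schema.yaml")
    if schema is not None:
        orders = schema.get_table_by_name("orders")  # raises ValueError if the table is missing
    for err in errors:
        print(f"validation problem: {err}")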

cloe_nessy/models/table.py
@@ -0,0 +1,236 @@
from pathlib import Path
from typing import Any, Self

import yaml
import yaml.scanner
from jinja2 import TemplateNotFound
from pydantic import Field, ValidationError, ValidationInfo, field_validator, model_validator

from ..logging import LoggerMixin
from ..utils.file_and_directory_handler import process_path
from .column import Column
from .constraint import Constraint
from .foreign_key import ForeignKey
from .mixins.read_instance_mixin import ReadInstancesMixin
from .mixins.template_loader_mixin import TemplateLoaderMixin
from .types import ValidationErrorType


class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
    """A Class to represent a Table in Unity Catalog."""

    identifier: str
    columns: list[Column]
    is_external: bool | None = None
    partition_by: list[str] = Field(default_factory=list)
    liquid_clustering: bool | None = None
    properties: dict[str, str] = Field(default_factory=dict)
    constraints: list[Constraint] = Field(default_factory=list)
    foreign_keys: list[ForeignKey] = Field(default_factory=list)
    storage_path: Path | None = None
    comment: str | None = None

    def model_post_init(self, __context: Any) -> None:
        """Post init method for the Table model."""
        self._console_logger = self.get_console_logger()
        self._tabular_logger = self.get_tabular_logger(uc_table_name="nessy_table_logs", log_type="nessy_table_logs")
        self._console_logger.debug(f"Model for table [ '{self.identifier}' ] has been initialized.")
        self._tabular_logger.debug(f"Message : Model for table [ '{self.identifier}' ] has been initialized.")

    @property
    def catalog(self):
        """The name of the Catalog of the Table."""
        return self.identifier.split(".")[0]

    @property
    def schema(self):
        """The name of the Schema of the Table."""
        return self.identifier.split(".")[1]

    @property
    def name(self):
        """The name of the Table."""
        return self.identifier.split(".")[2]

    @property
    def escaped_identifier(self):
        """The escaped identifier of the Table."""
        return f"`{self.catalog}`.`{self.schema}`.`{self.name}`"

    @field_validator("constraints", mode="before")
    def _validate_constraints(cls, raw: dict[str, dict[str, str]]) -> list[Constraint]:
        """The constraints are defined with the name as key or as a list and must therefore be transformed."""
        if isinstance(raw, dict):
            constraints = [Constraint(name=constraint, **raw[constraint]) for constraint in raw]
        elif isinstance(raw, list):
            constraints = []
            for item in raw:
                if isinstance(item, Constraint):
                    constraints.append(item)
                elif isinstance(item, dict):
                    constraints.append(Constraint(**item))
                else:
                    raise ValueError("Invalid constraint format")
        else:
            raise ValueError("Constraints must be either a list or a dictionary")
        return constraints

    @field_validator("foreign_keys", mode="after")
    def _validate_fk_columns(cls, v: list[ForeignKey], values: ValidationInfo):
        """Foreign keys need to be columns in the table as well."""
        column_names = [c.name for c in values.data.get("columns", [])]
        for fk in v:
            for column in fk.foreign_key_columns:
                if column not in column_names:
                    raise ValueError(f"Foreign key column '{column}' does not match any column in 'columns'")
        return v

    @model_validator(mode="after")
    def _validate_is_external(cls, table: Self):
        """If is_external is set to False, storage_path has to be None."""
        if not table.is_external and table.storage_path is not None:
            raise ValueError("is_external cannot be false while storage_path is set.")
        elif table.is_external and table.storage_path is None:
            raise ValueError("is_external cannot be true while storage_path is None.")

    @classmethod
    def read_instances_from_directory(
        cls,
        instance_path: str | Path,
        fail_on_missing_subfolder: bool = True,
        catalog_name: str | None = None,
        schema_name: str | None = None,
        schema_storage_path: str | Path | None = None,
        **_: Any,
    ) -> tuple[list[Self], list[ValidationErrorType]]:
        """Reads instances from a directory containing YAML files.

        This method scans a specified directory for YAML files (.yaml or .yml),
        attempts to read and parse each file as an instance of the class, and
        collects any errors encountered during the process. If the directory
        does not exist or is not a directory, it either raises a
        FileNotFoundError (if fail_on_missing_subfolder is True) or returns
        empty lists.

        Args:
            instance_path: The path to the directory containing instance files.
            catalog_name: Name of the catalog to which these instances belong.
            schema_name: Name of the schema used for validating the instances.
            fail_on_missing_subfolder: Determines behavior when the specified
                directory does not exist or is not a directory. Defaults to True,
                which will raise a FileNotFoundError.
            schema_storage_path: Path to the storage location of the schema
                these tables instances belong to.

        Returns:
            - The first list contains instances of the class that were
              successfully read and validated from the files.
            - The second list contains errors encountered during the
              process, which could be validation errors or YAML
              parsing/scanning errors.

        Raises:
            FileNotFoundError: If the specified directory does not exist or is
                not a directory and fail_on_missing_subfolder is True.
            ValueError: If catalog_name or schema_name are not provided.
        """
        processed_instance_path = process_path(instance_path)
        schema_storage_path = process_path(schema_storage_path)
        errors: list[ValidationErrorType] = []

        if not catalog_name or not schema_name:
            errors.append(ValueError("catalog_name and schema_name must be provided."))
            return [], errors
        instances: list[Self] = []

        if not processed_instance_path or not processed_instance_path.exists() or not processed_instance_path.is_dir():
            if fail_on_missing_subfolder:
                raise FileNotFoundError(f"Directory not found: {processed_instance_path}")
            else:
                return instances, errors

        for instance_file in processed_instance_path.iterdir():
            sub_errors: list[ValidationErrorType] = []
            if instance_file.is_file() and instance_file.suffix in (".yaml", ".yml"):
                instance, sub_errors = cls.read_instance_from_file(
                    instance_file, catalog_name, schema_name, schema_storage_path
                )
                instances += [] if instance is None else [instance]
            errors += sub_errors

        return instances, errors

    @classmethod
    def read_instance_from_file(
        cls,
        instance_path: str | Path,
        catalog_name: str | None = None,
        schema_name: str | None = None,
        schema_storage_path: str | Path | None = None,
        **_: Any,
    ) -> tuple[Self | None, list[ValidationErrorType]]:
        """Read a table instance from file.

        Args:
            instance_path: The path to the Schema definition YAML file.
            catalog_name: The name of the Catalog of the Table.
            schema_name: The name of the Schema of the Table.
            schema_storage_path: The storage path location of the Schema of the Table.

        Returns:
            - Instance of the class that was successfully instantiated and
              validated
            - The second list contains errors encountered during the process,
              which could be validation errors or YAML parsing/scanning errors.

        Raises:
            FileNotFoundError: If the specified directory does not exist or is
                not a directory and fail_on_missing_subfolder is
                True.
            ValueError: If catalog_name or schema_name are not provided.
        """
        processed_instance_path = process_path(instance_path)
        if not processed_instance_path:
            raise FileNotFoundError("Table file not found.")
        schema_storage_path = process_path(schema_storage_path)
        errors: list[ValidationErrorType] = []

        if not catalog_name or not schema_name:
            errors.append(ValueError("catalog_name and schema_name must be provided."))
            return None, errors

        try:
            with processed_instance_path.open("r") as file:
                data = yaml.safe_load(file)
            data["identifier"] = f"{catalog_name}.{schema_name}.{data['name']}"
            if data.get("is_external"):
                if storage_path := data.get("storage_path"):
                    data["storage_path"] = Path(storage_path)
                elif schema_storage_path:
                    data["storage_path"] = schema_storage_path / data["name"]
                else:
                    raise ValueError(
                        f"Neither storage path nor schema storage path of table {data['name']} has been provided."
                    )

            instance, sub_errors = cls.metadata_to_instance(data)
            errors += sub_errors
        except (ValidationError, yaml.parser.ParserError, yaml.scanner.ScannerError) as e:
            instance = None
            errors.append(e)
        return instance, errors

    def get_create_statement(
        self,
        templates: Path = Path("./templates"),
        template_name: str = "create_table.sql.j2",
        replace: bool = True,
    ):
        """Get the create statement for the Table."""
        try:
            template = self.get_template(templates, template_name)
        except TemplateNotFound as err:
            self._console_logger.error(f"Template [ {template_name} ] not found.")
            raise err
        render = template.render(table=self, replace=replace)
        return render
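
For illustration only (not part of the wheel contents), a short sketch of calling the Table.read_instances_from_directory classmethod shown above; the catalog, schema, and paths are placeholder values.

    from cloe_nessy.models.table import Table

    tables, errors = Table.read_instances_from_directory(
        instance_path="metadata/sales/tables",  # directory of table YAML files (placeholder)
        catalog_name="dev_catalog",
        schema_name="sales",
        schema_storage_path="abfss://container@account.dfs.core.windows.net/sales",
        fail_on_missing_subfolder=False,  # return ([], []) instead of raising when the folder is missing
    )
    for err in errors:
        print(f"validation problem: {err}")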

cloe_nessy/object_manager/table_manager.py
@@ -0,0 +1,58 @@
from ..logging import LoggerMixin
from ..session import SessionManager


class TableManager(LoggerMixin):
    """TableManager class for managing tables in the catalog."""

    def __init__(self):
        self._spark = SessionManager.get_spark_session()
        self._utils = SessionManager.get_utils()
        self._console_logger = self.get_console_logger()
        self._console_logger.debug("TableManager initialized...")
        self._tabular_logger = self.get_tabular_logger(uc_table_name="TableManager")
        self._tabular_logger.debug("message:TableManager initialized.")

    @staticmethod
    def create_table():
        """Create a table in the catalog."""
        raise NotImplementedError

    def drop_table(self, table_identifier: str, delete_physical_data: bool = False):
        """Deletes a Table. For security reasons you are forced to pass the table_name.

        If delete_physical_data is True the actual physical data on the ADLS will be deleted.
        Use with caution!

        Args:
            table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
            delete_physical_data: If set to True, deletes not only the metadata
                within the Catalog but also the physical data.
        """
        self._console_logger.info(f"Deleting table [ '{table_identifier}' ] ...")
        if not isinstance(table_identifier, str):
            raise NotImplementedError("table_identifier must be a string, can be a Table object in the future.")

        if delete_physical_data:
            self._delete_physical_data()
        self.drop_table_from_catalog(table_identifier)

    def drop_table_from_catalog(self, table_identifier: str) -> None:
        """Removes a table from the catalog. Physical data is retained.

        Args:
            table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
        """
        self._console_logger.info(f"... deleting table [ '{table_identifier}' ] from Catalog.")
        if not isinstance(table_identifier, str):
            raise NotImplementedError("table_identifier must be a string, can be a Table object in the future.")
        self._spark.sql(f"DROP TABLE IF EXISTS {table_identifier};")

    def _delete_physical_data(self):
        """Removes the physical data on the ADLS for the location of this table.

        Raises:
            NotImplementedError: This can be implemented, once a Table object is available.
        """
        self._console_logger.info("... deleting physical data for table [ '' ] from Catalog.")
        raise NotImplementedError("This can be implemented, once a Table object is available.")
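
For illustration only (not part of the wheel contents), a sketch of dropping a table via the TableManager shown above; it assumes a Spark session is obtainable through SessionManager, and the table identifier is a placeholder.

    from cloe_nessy.object_manager.table_manager import TableManager

    manager = TableManager()
    # Removes the table from the catalog only; physical data is retained because
    # delete_physical_data defaults to False (and is not yet implemented).
    manager.drop_table("dev_catalog.sales.orders")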

cloe_nessy/pipeline/__init__.py
@@ -0,0 +1,7 @@
from .pipeline import Pipeline
from .pipeline_action import PipelineAction
from .pipeline_context import PipelineContext
from .pipeline_parsing_service import PipelineParsingService
from .pipeline_step import PipelineStep

__all__ = ["Pipeline", "PipelineParsingService", "PipelineContext", "PipelineAction", "PipelineStep"]

cloe_nessy/pipeline/actions/__init__.py
@@ -0,0 +1,50 @@
from enum import Enum

from ..pipeline_action import PipelineAction
from .read_api import ReadAPIAction
from .read_catalog_table import ReadCatalogTableAction
from .read_excel import ReadExcelAction
from .read_files import ReadFilesAction
from .read_metadata_yaml import ReadMetadataYAMLAction
from .transform_change_datatype import TransformChangeDatatypeAction
from .transform_concat_columns import TransformConcatColumnsAction
from .transform_decode import TransformDecodeAction
from .transform_distinct import TransformDistinctAction
from .transform_filter import TransformFilterAction
from .transform_generic_sql import TransformSqlAction
from .transform_join import TransformJoinAction
from .transform_json_normalize import TransformJsonNormalize
from .transform_rename_columns import TransformRenameColumnsAction
from .transform_replace_values import TransformReplaceValuesAction
from .transform_select_columns import TransformSelectColumnsAction
from .transform_union import TransformUnionAction
from .write_catalog_table import WriteCatalogTableAction

# Get all subclasses of PipelineAction defined in this submodule
pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
# Register all subclasses dynamically as enum using their "name" attribute as
# key. We need to do this here, because otherwise we don't get all subclasses
# from a relative import of PipelineAction
PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore

__all__ = [
    "ReadAPIAction",
    "ReadCatalogTableAction",
    "ReadExcelAction",
    "ReadFilesAction",
    "ReadMetadataYAMLAction",
    "WriteCatalogTableAction",
    "PipelineActionType",
    "TransformFilterAction",
    "TransformUnionAction",
    "TransformChangeDatatypeAction",
    "TransformConcatColumnsAction",
    "TransformDecodeAction",
    "TransformDistinctAction",
    "TransformSqlAction",
    "TransformJoinAction",
    "TransformJsonNormalize",
    "TransformRenameColumnsAction",
    "TransformReplaceValuesAction",
    "TransformSelectColumnsAction",
]
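
For illustration only (not part of the wheel contents), a sketch of resolving an action class through the dynamically built PipelineActionType enum above; it assumes the enum member key is the action's name attribute, as in the registration code.

    from cloe_nessy.pipeline.actions import PipelineActionType

    action_cls = PipelineActionType["READ_API"].value  # the ReadAPIAction class
    print(action_cls.name)                             # "READ_API"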

cloe_nessy/pipeline/actions/read_api.py
@@ -0,0 +1,178 @@
from collections.abc import Mapping
from typing import Any, cast

from requests.auth import AuthBase, HTTPBasicAuth

from ...clients.api_client.auth import AzureCredentialAuth, ChainedAuth, EnvVariableAuth, SecretScopeAuth
from ...integration.reader import APIReader
from ..pipeline_action import PipelineAction
from ..pipeline_context import PipelineContext


def process_auth(
    auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | AuthBase | None,
) -> AuthBase | None:
    """Processes the auth parameter to create an AuthBase object.

    Args:
        auth: The auth parameter to be processed.
    """
    result: AuthBase | None = None

    if isinstance(auth, list):
        auths = [process_auth(sub_auth) for sub_auth in auth]
        result = ChainedAuth(*auths)
    elif isinstance(auth, dict):
        match auth.get("type"):
            case "basic":
                result = HTTPBasicAuth(auth["username"], auth["password"])
            case "secret_scope":
                secret_scope_header_template: dict[str, str] = auth["header_template"]
                result = SecretScopeAuth(secret_scope_header_template, auth["secret_scope"])
            case "env":
                env_header_template: dict[str, str] = auth["header_template"]
                result = EnvVariableAuth(env_header_template)
            case "azure_oauth":
                result = AzureCredentialAuth(
                    scope=auth["scope"],
                    client_id=auth["client_id"],
                    client_secret=auth["client_secret"],
                    tenant_id=auth["tenant_id"],
                )
            case _:
                raise ValueError("Invalid auth type specified. Supported types are: basic, secret_scope, env")
    else:
        result = cast(AuthBase, auth)

    return result


class ReadAPIAction(PipelineAction):
    """Reads data from an API and loads it into a Spark DataFrame.

    This method uses the provided API parameters to make a request using the
    [`APIReader`][cloe_nessy.integration.reader.api_reader] and return a
    DataFrame containing the response data.

    Example:
        ```yaml
        Read API:
            action: READ_API
            options:
                base_url: https://some_url.com/api/
                endpoint: my/endpoint/
                method: GET
                timeout: 90
                auth:
                    - type: basic
                      username: my_username
                      password: my_password
                    - type: secret_scope
                      secret_scope: my_secret_scope
                      header_template:
                          "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
                    - type: secret_scope
                      secret_scope: my_secret_scope
                      header_template:
                          "header_key_2": "<SECRET_NAME>"
                    - type: secret_scope
                      secret_scope: my_other_secret_scope
                      header_template:
                          "header_key_3": "<SECRET_NAME>"
                    - type: azure_oauth
                      client_id: my_client_id
                      client_secret: my_client_secret
                      tenant_id: my_tenant_id
                      scope: <entra-id-client-id>
        ```

    The above example will combine the headers from the different auth types. The resulting header will look like this:

    ```json
    {
        "header_key_1": "value_from_environment_variable",
        "header_key_2": "value_from_secret",
        "header_key_3": "value_from_secret",
        "Authorization": "Bearer <access_token> (from azure_oauth)",
        "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
    }
    ```

    !!! warning

        Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
        Use secret scopes or environment variables instead.
    """

    name: str = "READ_API"

    @staticmethod
    def run(
        context: PipelineContext,
        *,
        base_url: str | None = None,
        auth: AuthBase | dict[str, str] | None = None,
        default_headers: dict[str, str] | None = None,
        endpoint: str = "",  # www.neo4j.de/api/table/2020/01/01
        method: str = "GET",
        key: str | None = None,
        timeout: int = 30,
        params: dict[str, str] | None = None,
        headers: dict[str, str] | None = None,
        data: dict[str, str] | None = None,
        json: dict[str, str] | None = None,
        max_retries: int = 0,
        options: dict[str, str] | None = None,
        **_: Any,
    ) -> PipelineContext:
        """Utility class for reading an API into a DataFrame.

        This class uses an APIClient to fetch data from an API and load it into a Spark DataFrame.


        Args:
            context: The pipeline context containing information about the pipeline.
            base_url: The base URL for the API to be called.
            auth: The authentication credentials for the API.
            default_headers: Default headers to include in the API request.
            endpoint: The specific API endpoint to call.
            method: The HTTP method to use for the request (default is "GET").
            key: Key for accessing specific data in the response.
            timeout: Timeout for the API request in seconds (default is 30).
            params: URL parameters to include in the API request.
            headers: Additional headers to include in the request.
            data: Data to send with the request for POST methods.
            json: JSON data to send with the request for POST methods.
            max_retries: Maximum number of retries for the API request (default is 0).
            options: Additional options for the API request.

        Returns:
            The updated pipeline context containing the DataFrame with the API response data.

        Raises:
            ValueError: If the base_url is not specified.
        """
        if not options:
            options = dict()

        if base_url is None:
            raise ValueError("base_url must be specified to fetch data from API.")

        deserialized_auth = process_auth(auth)

        api_reader = APIReader(base_url=base_url, auth=deserialized_auth, default_headers=default_headers)

        df = api_reader.read(
            method=method,
            endpoint=endpoint,
            timeout=timeout,
            params=params,
            key=key,
            headers=headers,
            data=data,
            json=json,
            max_retries=max_retries,
            options=options,
        )

        return context.from_existing(data=df)
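
For illustration only (not part of the wheel contents), a minimal sketch of building a requests AuthBase object from a plain configuration with the process_auth helper shown above; the credential values and environment variable name are placeholders.

    from cloe_nessy.pipeline.actions.read_api import process_auth

    auth = process_auth(
        [
            {"type": "basic", "username": "my_username", "password": "my_password"},
            {"type": "env", "header_template": {"X-Api-Key": "MY_API_KEY_ENV_VAR"}},
        ]
    )
    # auth is a ChainedAuth wrapping an HTTPBasicAuth and an EnvVariableAuth and
    # can be passed to ReadAPIAction.run(...) as the auth argument.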