cloe-nessy 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +5 -0
  3. cloe_nessy/clients/api_client/__init__.py +3 -0
  4. cloe_nessy/clients/api_client/api_client.py +188 -0
  5. cloe_nessy/clients/api_client/api_response.py +72 -0
  6. cloe_nessy/clients/api_client/auth.py +178 -0
  7. cloe_nessy/clients/api_client/exceptions.py +22 -0
  8. cloe_nessy/file_utilities/__init__.py +3 -0
  9. cloe_nessy/file_utilities/exceptions.py +4 -0
  10. cloe_nessy/file_utilities/factory.py +42 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +72 -0
  12. cloe_nessy/file_utilities/location_types.py +29 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +6 -0
  20. cloe_nessy/integration/reader/api_reader.py +141 -0
  21. cloe_nessy/integration/reader/catalog_reader.py +49 -0
  22. cloe_nessy/integration/reader/excel_reader.py +170 -0
  23. cloe_nessy/integration/reader/exceptions.py +10 -0
  24. cloe_nessy/integration/reader/file_reader.py +96 -0
  25. cloe_nessy/integration/reader/reader.py +34 -0
  26. cloe_nessy/integration/writer/__init__.py +3 -0
  27. cloe_nessy/integration/writer/catalog_writer.py +48 -0
  28. cloe_nessy/logging/__init__.py +3 -0
  29. cloe_nessy/logging/logger_mixin.py +162 -0
  30. cloe_nessy/models/__init__.py +13 -0
  31. cloe_nessy/models/column.py +65 -0
  32. cloe_nessy/models/constraint.py +9 -0
  33. cloe_nessy/models/foreign_key.py +34 -0
  34. cloe_nessy/models/mixins/__init__.py +0 -0
  35. cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
  36. cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
  37. cloe_nessy/models/schema.py +76 -0
  38. cloe_nessy/models/table.py +236 -0
  39. cloe_nessy/models/types.py +7 -0
  40. cloe_nessy/object_manager/__init__.py +3 -0
  41. cloe_nessy/object_manager/table_manager.py +58 -0
  42. cloe_nessy/pipeline/__init__.py +7 -0
  43. cloe_nessy/pipeline/actions/__init__.py +50 -0
  44. cloe_nessy/pipeline/actions/read_api.py +178 -0
  45. cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
  46. cloe_nessy/pipeline/actions/read_excel.py +177 -0
  47. cloe_nessy/pipeline/actions/read_files.py +105 -0
  48. cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
  49. cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
  50. cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
  51. cloe_nessy/pipeline/actions/transform_decode.py +102 -0
  52. cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
  53. cloe_nessy/pipeline/actions/transform_filter.py +51 -0
  54. cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
  55. cloe_nessy/pipeline/actions/transform_join.py +81 -0
  56. cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
  57. cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
  58. cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
  59. cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
  60. cloe_nessy/pipeline/actions/transform_union.py +71 -0
  61. cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
  62. cloe_nessy/pipeline/pipeline.py +201 -0
  63. cloe_nessy/pipeline/pipeline_action.py +62 -0
  64. cloe_nessy/pipeline/pipeline_config.py +92 -0
  65. cloe_nessy/pipeline/pipeline_context.py +56 -0
  66. cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
  67. cloe_nessy/pipeline/pipeline_step.py +50 -0
  68. cloe_nessy/py.typed +0 -0
  69. cloe_nessy/session/__init__.py +3 -0
  70. cloe_nessy/session/session_manager.py +188 -0
  71. cloe_nessy/settings/__init__.py +3 -0
  72. cloe_nessy/settings/settings.py +91 -0
  73. cloe_nessy/utils/__init__.py +0 -0
  74. cloe_nessy/utils/file_and_directory_handler.py +19 -0
  75. cloe_nessy-0.2.9.dist-info/METADATA +26 -0
  76. cloe_nessy-0.2.9.dist-info/RECORD +78 -0
  77. cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
  78. cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,76 @@
+ from pathlib import Path
+ from typing import Any, Self
+
+ from pydantic import Field
+
+ from ..utils.file_and_directory_handler import process_path
+ from .mixins.read_instance_mixin import ReadInstancesMixin
+ from .table import Table
+ from .types import ValidationErrorType
+
+
+ class Schema(ReadInstancesMixin):
+     """A class to represent a Schema in Unity Catalog."""
+
+     catalog: str
+     name: str
+     storage_path: str | None = None
+     tables: list[Table] = Field(default_factory=list)
+     properties: dict[str, Any] = Field(default_factory=dict)
+
+     @classmethod
+     def read_instance_from_file(
+         cls,
+         instance_path: str | Path,
+         fail_on_missing_subfolder: bool = True,
+         table_dir_name: str = "tables",
+         **_: Any,
+     ) -> tuple[Self | None, list[ValidationErrorType]]:
+         """Read a schema from file.
+
+         Adds the table objects from a path relative to the schema definition.
+
+         Args:
+             instance_path: The path to the Schema definition YAML file.
+             fail_on_missing_subfolder: If False, return a tuple with two empty
+                 lists; otherwise raise a FileNotFoundError.
+             table_dir_name: The name of the directory containing the Table
+                 definitions related to this schema. Can be a relative path.
+         """
+         processed_instance_path = process_path(instance_path)
+         if not processed_instance_path:
+             raise FileNotFoundError("Schema file not found.")
+
+         schema, schema_errors = super().read_instance_from_file(processed_instance_path)
+         if schema:
+             schema.storage_path = "" if not schema.storage_path else schema.storage_path
+             tables, table_errors = Table.read_instances_from_directory(
+                 instance_path=processed_instance_path.parents[0] / table_dir_name,
+                 catalog_name=schema.catalog,
+                 schema_name=schema.name,
+                 schema_storage_path=Path(schema.storage_path),
+                 fail_on_missing_subfolder=fail_on_missing_subfolder,
+             )
+             schema.tables = tables
+         return schema, schema_errors + table_errors
+
+     def get_table_by_name(self, table_name: str) -> Table:
+         """Return a table in the schema.
+
+         Filters tables in the schema for table_name and returns the Table object.
+
+         Args:
+             table_name: Name of the table to return from the schema.
+
+         Raises:
+             ValueError: If the table is not found in the schema metadata.
+
+         Returns:
+             The table.
+         """
+         table = next((table for table in self.tables if table.name == table_name), None)
+
+         if not table:
+             raise ValueError(f"Table {table_name} not found in {self.catalog}.{self.name} metadata.")
+
+         return table
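For orientation, here is a minimal usage sketch of the `Schema` model above (cloe_nessy/models/schema.py). The file layout (`metadata/my_schema/schema.yaml` with a sibling `tables/` directory) and the table name `orders` are illustrative assumptions, not part of the package:

```python
# Hypothetical layout: metadata/my_schema/schema.yaml plus metadata/my_schema/tables/*.yaml
from cloe_nessy.models.schema import Schema

schema, errors = Schema.read_instance_from_file(
    "metadata/my_schema/schema.yaml",
    fail_on_missing_subfolder=False,  # tolerate a missing tables/ directory
)
for error in errors:
    print(f"validation issue: {error}")

if schema is not None:
    table = schema.get_table_by_name("orders")  # raises ValueError if the table is absent
    print(table.escaped_identifier)
```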
@@ -0,0 +1,236 @@
+ from pathlib import Path
+ from typing import Any, Self
+
+ import yaml
+ import yaml.scanner
+ from jinja2 import TemplateNotFound
+ from pydantic import Field, ValidationError, ValidationInfo, field_validator, model_validator
+
+ from ..logging import LoggerMixin
+ from ..utils.file_and_directory_handler import process_path
+ from .column import Column
+ from .constraint import Constraint
+ from .foreign_key import ForeignKey
+ from .mixins.read_instance_mixin import ReadInstancesMixin
+ from .mixins.template_loader_mixin import TemplateLoaderMixin
+ from .types import ValidationErrorType
+
+
+ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
+     """A class to represent a Table in Unity Catalog."""
+
+     identifier: str
+     columns: list[Column]
+     is_external: bool | None = None
+     partition_by: list[str] = Field(default_factory=list)
+     liquid_clustering: bool | None = None
+     properties: dict[str, str] = Field(default_factory=dict)
+     constraints: list[Constraint] = Field(default_factory=list)
+     foreign_keys: list[ForeignKey] = Field(default_factory=list)
+     storage_path: Path | None = None
+     comment: str | None = None
+
+     def model_post_init(self, __context: Any) -> None:
+         """Post-init method for the Table model."""
+         self._console_logger = self.get_console_logger()
+         self._tabular_logger = self.get_tabular_logger(uc_table_name="nessy_table_logs", log_type="nessy_table_logs")
+         self._console_logger.debug(f"Model for table [ '{self.identifier}' ] has been initialized.")
+         self._tabular_logger.debug(f"Message : Model for table [ '{self.identifier}' ] has been initialized.")
+
+     @property
+     def catalog(self):
+         """The name of the Catalog of the Table."""
+         return self.identifier.split(".")[0]
+
+     @property
+     def schema(self):
+         """The name of the Schema of the Table."""
+         return self.identifier.split(".")[1]
+
+     @property
+     def name(self):
+         """The name of the Table."""
+         return self.identifier.split(".")[2]
+
+     @property
+     def escaped_identifier(self):
+         """The escaped identifier of the Table."""
+         return f"`{self.catalog}`.`{self.schema}`.`{self.name}`"
+
+     @field_validator("constraints", mode="before")
+     def _validate_constraints(cls, raw: dict[str, dict[str, str]]) -> list[Constraint]:
+         """The constraints are defined with the name as key or as a list and must therefore be transformed."""
+         if isinstance(raw, dict):
+             constraints = [Constraint(name=constraint, **raw[constraint]) for constraint in raw]
+         elif isinstance(raw, list):
+             constraints = []
+             for item in raw:
+                 if isinstance(item, Constraint):
+                     constraints.append(item)
+                 elif isinstance(item, dict):
+                     constraints.append(Constraint(**item))
+                 else:
+                     raise ValueError("Invalid constraint format")
+         else:
+             raise ValueError("Constraints must be either a list or a dictionary")
+         return constraints
+
+     @field_validator("foreign_keys", mode="after")
+     def _validate_fk_columns(cls, v: list[ForeignKey], values: ValidationInfo):
+         """Foreign keys need to be columns in the table as well."""
+         column_names = [c.name for c in values.data.get("columns", [])]
+         for fk in v:
+             for column in fk.foreign_key_columns:
+                 if column not in column_names:
+                     raise ValueError(f"Foreign key column '{column}' does not match any column in 'columns'")
+         return v
+
+     @model_validator(mode="after")
+     def _validate_is_external(cls, table: Self):
+         """If is_external is set to False, storage_path has to be None."""
+         if not table.is_external and table.storage_path is not None:
+             raise ValueError("is_external cannot be false while storage_path is set.")
+         elif table.is_external and table.storage_path is None:
+             raise ValueError("is_external cannot be true while storage_path is None.")
+         return table
+
+     @classmethod
+     def read_instances_from_directory(
+         cls,
+         instance_path: str | Path,
+         fail_on_missing_subfolder: bool = True,
+         catalog_name: str | None = None,
+         schema_name: str | None = None,
+         schema_storage_path: str | Path | None = None,
+         **_: Any,
+     ) -> tuple[list[Self], list[ValidationErrorType]]:
+         """Reads instances from a directory containing YAML files.
+
+         This method scans a specified directory for YAML files (.yaml or .yml),
+         attempts to read and parse each file as an instance of the class, and
+         collects any errors encountered during the process. If the directory
+         does not exist or is not a directory, it either raises a
+         FileNotFoundError (if fail_on_missing_subfolder is True) or returns
+         empty lists.
+
+         Args:
+             instance_path: The path to the directory containing instance files.
+             catalog_name: Name of the catalog to which these instances belong.
+             schema_name: Name of the schema used for validating the instances.
+             fail_on_missing_subfolder: Determines behavior when the specified
+                 directory does not exist or is not a directory. Defaults to True,
+                 which will raise a FileNotFoundError.
+             schema_storage_path: Path to the storage location of the schema
+                 these table instances belong to.
+
+         Returns:
+             - The first list contains instances of the class that were
+               successfully read and validated from the files.
+             - The second list contains errors encountered during the
+               process, which could be validation errors or YAML
+               parsing/scanning errors.
+
+         Raises:
+             FileNotFoundError: If the specified directory does not exist or is
+                 not a directory and fail_on_missing_subfolder is True.
+             ValueError: If catalog_name or schema_name are not provided.
+         """
+         processed_instance_path = process_path(instance_path)
+         schema_storage_path = process_path(schema_storage_path)
+         errors: list[ValidationErrorType] = []
+
+         if not catalog_name or not schema_name:
+             errors.append(ValueError("catalog_name and schema_name must be provided."))
+             return [], errors
+         instances: list[Self] = []
+
+         if not processed_instance_path or not processed_instance_path.exists() or not processed_instance_path.is_dir():
+             if fail_on_missing_subfolder:
+                 raise FileNotFoundError(f"Directory not found: {processed_instance_path}")
+             else:
+                 return instances, errors
+
+         for instance_file in processed_instance_path.iterdir():
+             sub_errors: list[ValidationErrorType] = []
+             if instance_file.is_file() and instance_file.suffix in (".yaml", ".yml"):
+                 instance, sub_errors = cls.read_instance_from_file(
+                     instance_file, catalog_name, schema_name, schema_storage_path
+                 )
+                 instances += [] if instance is None else [instance]
+             errors += sub_errors
+
+         return instances, errors
+
+     @classmethod
+     def read_instance_from_file(
+         cls,
+         instance_path: str | Path,
+         catalog_name: str | None = None,
+         schema_name: str | None = None,
+         schema_storage_path: str | Path | None = None,
+         **_: Any,
+     ) -> tuple[Self | None, list[ValidationErrorType]]:
+         """Read a table instance from file.
+
+         Args:
+             instance_path: The path to the Table definition YAML file.
+             catalog_name: The name of the Catalog of the Table.
+             schema_name: The name of the Schema of the Table.
+             schema_storage_path: The storage path location of the Schema of the Table.
+
+         Returns:
+             - An instance of the class that was successfully instantiated and
+               validated.
+             - The second list contains errors encountered during the process,
+               which could be validation errors or YAML parsing/scanning errors.
+
+         Raises:
+             FileNotFoundError: If the Table definition file cannot be found.
+             ValueError: If catalog_name or schema_name are not provided.
+         """
+         processed_instance_path = process_path(instance_path)
+         if not processed_instance_path:
+             raise FileNotFoundError("Table file not found.")
+         schema_storage_path = process_path(schema_storage_path)
+         errors: list[ValidationErrorType] = []
+
+         if not catalog_name or not schema_name:
+             errors.append(ValueError("catalog_name and schema_name must be provided."))
+             return None, errors
+
+         try:
+             with processed_instance_path.open("r") as file:
+                 data = yaml.safe_load(file)
+             data["identifier"] = f"{catalog_name}.{schema_name}.{data['name']}"
+             if data.get("is_external"):
+                 if storage_path := data.get("storage_path"):
+                     data["storage_path"] = Path(storage_path)
+                 elif schema_storage_path:
+                     data["storage_path"] = schema_storage_path / data["name"]
+                 else:
+                     raise ValueError(
+                         f"Neither storage path nor schema storage path of table {data['name']} has been provided."
+                     )
+
+             instance, sub_errors = cls.metadata_to_instance(data)
+             errors += sub_errors
+         except (ValidationError, yaml.parser.ParserError, yaml.scanner.ScannerError) as e:
+             instance = None
+             errors.append(e)
+         return instance, errors
+
+     def get_create_statement(
+         self,
+         templates: Path = Path("./templates"),
+         template_name: str = "create_table.sql.j2",
+         replace: bool = True,
+     ):
+         """Get the create statement for the Table."""
+         try:
+             template = self.get_template(templates, template_name)
+         except TemplateNotFound as err:
+             self._console_logger.error(f"Template [ {template_name} ] not found.")
+             raise err
+         render = template.render(table=self, replace=replace)
+         return render
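A short, hedged sketch of how a single table definition might be loaded and turned into DDL with the `Table` methods above (cloe_nessy/models/table.py). The YAML path, catalog/schema names, storage location, and the presence of a `templates/create_table.sql.j2` file are assumptions for illustration, and the sketch assumes an environment where the package's logging and Spark dependencies are available:

```python
from pathlib import Path

from cloe_nessy.models.table import Table

# Assumed inputs; the YAML file must at least provide "name" and "columns".
table, errors = Table.read_instance_from_file(
    "metadata/my_schema/tables/orders.yaml",
    catalog_name="my_catalog",
    schema_name="my_schema",
    schema_storage_path="abfss://container@account.dfs.core.windows.net/my_schema",
)

if table is not None:
    # Renders the Jinja template (default: ./templates/create_table.sql.j2) against the model.
    print(table.get_create_statement(templates=Path("./templates")))
```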
@@ -0,0 +1,7 @@
+ from typing import TypeAlias
+
+ import yaml.parser
+ import yaml.scanner
+ from pydantic import ValidationError
+
+ ValidationErrorType: TypeAlias = ValidationError | yaml.parser.ParserError | yaml.scanner.ScannerError | ValueError
@@ -0,0 +1,3 @@
+ from .table_manager import TableManager
+
+ __all__ = ["TableManager"]
@@ -0,0 +1,58 @@
+ from ..logging import LoggerMixin
+ from ..session import SessionManager
+
+
+ class TableManager(LoggerMixin):
+     """TableManager class for managing tables in the catalog."""
+
+     def __init__(self):
+         self._spark = SessionManager.get_spark_session()
+         self._utils = SessionManager.get_utils()
+         self._console_logger = self.get_console_logger()
+         self._console_logger.debug("TableManager initialized...")
+         self._tabular_logger = self.get_tabular_logger(uc_table_name="TableManager")
+         self._tabular_logger.debug("message:TableManager initialized.")
+
+     @staticmethod
+     def create_table():
+         """Create a table in the catalog."""
+         raise NotImplementedError
+
+     def drop_table(self, table_identifier: str, delete_physical_data: bool = False):
+         """Deletes a Table. For security reasons you are forced to pass the table_identifier.
+
+         If delete_physical_data is True, the actual physical data on the ADLS will be deleted.
+         Use with caution!
+
+         Args:
+             table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
+             delete_physical_data: If set to True, deletes not only the metadata
+                 within the Catalog but also the physical data.
+         """
+         self._console_logger.info(f"Deleting table [ '{table_identifier}' ] ...")
+         if not isinstance(table_identifier, str):
+             raise NotImplementedError("table_identifier must be a string, can be a Table object in the future.")
+
+         if delete_physical_data:
+             self._delete_physical_data()
+         self.drop_table_from_catalog(table_identifier)
+
+     def drop_table_from_catalog(self, table_identifier: str) -> None:
+         """Removes a table from the catalog. Physical data is retained.
+
+         Args:
+             table_identifier: The table identifier in the catalog. Must be in the format 'catalog.schema.table'.
+         """
+         self._console_logger.info(f"... deleting table [ '{table_identifier}' ] from Catalog.")
+         if not isinstance(table_identifier, str):
+             raise NotImplementedError("table_identifier must be a string, can be a Table object in the future.")
+         self._spark.sql(f"DROP TABLE IF EXISTS {table_identifier};")
+
+     def _delete_physical_data(self):
+         """Removes the physical data on the ADLS for the location of this table.
+
+         Raises:
+             NotImplementedError: This can be implemented, once a Table object is available.
+         """
+         self._console_logger.info("... deleting physical data for table [ '' ] from Catalog.")
+         raise NotImplementedError("This can be implemented, once a Table object is available.")
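A minimal usage sketch for `TableManager` (cloe_nessy/object_manager/table_manager.py), assuming a Spark-enabled (e.g. Databricks) environment in which `SessionManager` can supply a session; the identifier is a placeholder:

```python
from cloe_nessy.object_manager import TableManager

manager = TableManager()

# Drops only the catalog entry; physical data is retained because
# delete_physical_data defaults to False (passing True currently raises
# NotImplementedError via _delete_physical_data).
manager.drop_table("my_catalog.my_schema.orders")
```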
@@ -0,0 +1,7 @@
+ from .pipeline import Pipeline
+ from .pipeline_action import PipelineAction
+ from .pipeline_context import PipelineContext
+ from .pipeline_parsing_service import PipelineParsingService
+ from .pipeline_step import PipelineStep
+
+ __all__ = ["Pipeline", "PipelineParsingService", "PipelineContext", "PipelineAction", "PipelineStep"]
@@ -0,0 +1,50 @@
+ from enum import Enum
+
+ from ..pipeline_action import PipelineAction
+ from .read_api import ReadAPIAction
+ from .read_catalog_table import ReadCatalogTableAction
+ from .read_excel import ReadExcelAction
+ from .read_files import ReadFilesAction
+ from .read_metadata_yaml import ReadMetadataYAMLAction
+ from .transform_change_datatype import TransformChangeDatatypeAction
+ from .transform_concat_columns import TransformConcatColumnsAction
+ from .transform_decode import TransformDecodeAction
+ from .transform_distinct import TransformDistinctAction
+ from .transform_filter import TransformFilterAction
+ from .transform_generic_sql import TransformSqlAction
+ from .transform_join import TransformJoinAction
+ from .transform_json_normalize import TransformJsonNormalize
+ from .transform_rename_columns import TransformRenameColumnsAction
+ from .transform_replace_values import TransformReplaceValuesAction
+ from .transform_select_columns import TransformSelectColumnsAction
+ from .transform_union import TransformUnionAction
+ from .write_catalog_table import WriteCatalogTableAction
+
+ # Get all subclasses of PipelineAction defined in this submodule.
+ pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
+ # Register all subclasses dynamically as an enum, using their "name" attribute
+ # as the key. We need to do this here because otherwise we don't get all
+ # subclasses from a relative import of PipelineAction.
+ PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore
+
+ __all__ = [
+     "ReadAPIAction",
+     "ReadCatalogTableAction",
+     "ReadExcelAction",
+     "ReadFilesAction",
+     "ReadMetadataYAMLAction",
+     "WriteCatalogTableAction",
+     "PipelineActionType",
+     "TransformFilterAction",
+     "TransformUnionAction",
+     "TransformChangeDatatypeAction",
+     "TransformConcatColumnsAction",
+     "TransformDecodeAction",
+     "TransformDistinctAction",
+     "TransformSqlAction",
+     "TransformJoinAction",
+     "TransformJsonNormalize",
+     "TransformRenameColumnsAction",
+     "TransformReplaceValuesAction",
+     "TransformSelectColumnsAction",
+ ]
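The dynamically built `PipelineActionType` enum maps each action's `name` attribute to its class, so an action class can be resolved from the string used in pipeline configurations. A small sketch of that lookup, assuming the enum behaves as registered above:

```python
from cloe_nessy.pipeline.actions import PipelineActionType, ReadAPIAction

# Member names are the registered action names; member values are the action classes.
action_cls = PipelineActionType["READ_API"].value
assert action_cls is ReadAPIAction
```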
@@ -0,0 +1,178 @@
+ from collections.abc import Mapping
+ from typing import Any, cast
+
+ from requests.auth import AuthBase, HTTPBasicAuth
+
+ from ...clients.api_client.auth import AzureCredentialAuth, ChainedAuth, EnvVariableAuth, SecretScopeAuth
+ from ...integration.reader import APIReader
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ def process_auth(
+     auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | AuthBase | None,
+ ) -> AuthBase | None:
+     """Processes the auth parameter to create an AuthBase object.
+
+     Args:
+         auth: The auth parameter to be processed.
+     """
+     result: AuthBase | None = None
+
+     if isinstance(auth, list):
+         auths = [process_auth(sub_auth) for sub_auth in auth]
+         result = ChainedAuth(*auths)
+     elif isinstance(auth, dict):
+         match auth.get("type"):
+             case "basic":
+                 result = HTTPBasicAuth(auth["username"], auth["password"])
+             case "secret_scope":
+                 secret_scope_header_template: dict[str, str] = auth["header_template"]
+                 result = SecretScopeAuth(secret_scope_header_template, auth["secret_scope"])
+             case "env":
+                 env_header_template: dict[str, str] = auth["header_template"]
+                 result = EnvVariableAuth(env_header_template)
+             case "azure_oauth":
+                 result = AzureCredentialAuth(
+                     scope=auth["scope"],
+                     client_id=auth["client_id"],
+                     client_secret=auth["client_secret"],
+                     tenant_id=auth["tenant_id"],
+                 )
+             case _:
+                 raise ValueError(
+                     "Invalid auth type specified. Supported types are: basic, secret_scope, env, azure_oauth"
+                 )
+     else:
+         result = cast(AuthBase, auth)
+
+     return result
+
+
+ class ReadAPIAction(PipelineAction):
+     """Reads data from an API and loads it into a Spark DataFrame.
+
+     This action uses the provided API parameters to make a request using the
+     [`APIReader`][cloe_nessy.integration.reader.api_reader] and returns a
+     DataFrame containing the response data.
+
+     Example:
+         ```yaml
+         Read API:
+             action: READ_API
+             options:
+                 base_url: https://some_url.com/api/
+                 endpoint: my/endpoint/
+                 method: GET
+                 timeout: 90
+                 auth:
+                     - type: basic
+                       username: my_username
+                       password: my_password
+                     - type: secret_scope
+                       secret_scope: my_secret_scope
+                       header_template:
+                           "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
+                     - type: secret_scope
+                       secret_scope: my_secret_scope
+                       header_template:
+                           "header_key_2": "<SECRET_NAME>"
+                     - type: secret_scope
+                       secret_scope: my_other_secret_scope
+                       header_template:
+                           "header_key_3": "<SECRET_NAME>"
+                     - type: azure_oauth
+                       client_id: my_client_id
+                       client_secret: my_client_secret
+                       tenant_id: my_tenant_id
+                       scope: <entra-id-client-id>
+         ```
+
+     The above example will combine the headers from the different auth types. The resulting header will look like this:
+
+     ```json
+     {
+         "header_key_1": "value_from_environment_variable",
+         "header_key_2": "value_from_secret",
+         "header_key_3": "value_from_secret",
+         "Authorization": "Bearer <access_token> (from azure_oauth)",
+         "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
+     }
+     ```
+
+     !!! warning
+
+         Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
+         Use secret scopes or environment variables instead.
+     """
+
+     name: str = "READ_API"
+
+     @staticmethod
+     def run(
+         context: PipelineContext,
+         *,
+         base_url: str | None = None,
+         auth: AuthBase | dict[str, str] | None = None,
+         default_headers: dict[str, str] | None = None,
+         endpoint: str = "",  # www.neo4j.de/api/table/2020/01/01
+         method: str = "GET",
+         key: str | None = None,
+         timeout: int = 30,
+         params: dict[str, str] | None = None,
+         headers: dict[str, str] | None = None,
+         data: dict[str, str] | None = None,
+         json: dict[str, str] | None = None,
+         max_retries: int = 0,
+         options: dict[str, str] | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Reads an API into a DataFrame.
+
+         This method uses an APIClient to fetch data from an API and load it into a Spark DataFrame.
+
+         Args:
+             context: The pipeline context containing information about the pipeline.
+             base_url: The base URL for the API to be called.
+             auth: The authentication credentials for the API.
+             default_headers: Default headers to include in the API request.
+             endpoint: The specific API endpoint to call.
+             method: The HTTP method to use for the request (default is "GET").
+             key: Key for accessing specific data in the response.
+             timeout: Timeout for the API request in seconds (default is 30).
+             params: URL parameters to include in the API request.
+             headers: Additional headers to include in the request.
+             data: Data to send with the request for POST methods.
+             json: JSON data to send with the request for POST methods.
+             max_retries: Maximum number of retries for the API request (default is 0).
+             options: Additional options for the API request.
+
+         Returns:
+             The updated pipeline context containing the DataFrame with the API response data.
+
+         Raises:
+             ValueError: If the base_url is not specified.
+         """
+         if not options:
+             options = dict()
+
+         if base_url is None:
+             raise ValueError("base_url must be specified to fetch data from API.")
+
+         deserialized_auth = process_auth(auth)
+
+         api_reader = APIReader(base_url=base_url, auth=deserialized_auth, default_headers=default_headers)
+
+         df = api_reader.read(
+             method=method,
+             endpoint=endpoint,
+             timeout=timeout,
+             params=params,
+             key=key,
+             headers=headers,
+             data=data,
+             json=json,
+             max_retries=max_retries,
+             options=options,
+         )
+
+         return context.from_existing(data=df)
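To make the declarative `auth` option above concrete, here is a hedged sketch of calling `process_auth` directly with the same shapes the YAML example produces; the credential values and the environment-variable name are placeholders:

```python
from cloe_nessy.pipeline.actions.read_api import process_auth

# A list becomes a ChainedAuth; each dict is dispatched on its "type" key.
auth = process_auth(
    [
        {"type": "basic", "username": "my_username", "password": "my_password"},
        {"type": "env", "header_template": {"header_key_1": "MY_ENV_VAR"}},
    ]
)
# The resulting AuthBase can then be passed on to the APIReader or to requests.
```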