cloe-nessy 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +5 -0
- cloe_nessy/clients/api_client/__init__.py +3 -0
- cloe_nessy/clients/api_client/api_client.py +188 -0
- cloe_nessy/clients/api_client/api_response.py +72 -0
- cloe_nessy/clients/api_client/auth.py +178 -0
- cloe_nessy/clients/api_client/exceptions.py +22 -0
- cloe_nessy/file_utilities/__init__.py +3 -0
- cloe_nessy/file_utilities/exceptions.py +4 -0
- cloe_nessy/file_utilities/factory.py +42 -0
- cloe_nessy/file_utilities/get_file_paths.py +72 -0
- cloe_nessy/file_utilities/location_types.py +29 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +6 -0
- cloe_nessy/integration/reader/api_reader.py +141 -0
- cloe_nessy/integration/reader/catalog_reader.py +49 -0
- cloe_nessy/integration/reader/excel_reader.py +170 -0
- cloe_nessy/integration/reader/exceptions.py +10 -0
- cloe_nessy/integration/reader/file_reader.py +96 -0
- cloe_nessy/integration/reader/reader.py +34 -0
- cloe_nessy/integration/writer/__init__.py +3 -0
- cloe_nessy/integration/writer/catalog_writer.py +48 -0
- cloe_nessy/logging/__init__.py +3 -0
- cloe_nessy/logging/logger_mixin.py +162 -0
- cloe_nessy/models/__init__.py +13 -0
- cloe_nessy/models/column.py +65 -0
- cloe_nessy/models/constraint.py +9 -0
- cloe_nessy/models/foreign_key.py +34 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
- cloe_nessy/models/schema.py +76 -0
- cloe_nessy/models/table.py +236 -0
- cloe_nessy/models/types.py +7 -0
- cloe_nessy/object_manager/__init__.py +3 -0
- cloe_nessy/object_manager/table_manager.py +58 -0
- cloe_nessy/pipeline/__init__.py +7 -0
- cloe_nessy/pipeline/actions/__init__.py +50 -0
- cloe_nessy/pipeline/actions/read_api.py +178 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
- cloe_nessy/pipeline/actions/read_excel.py +177 -0
- cloe_nessy/pipeline/actions/read_files.py +105 -0
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
- cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
- cloe_nessy/pipeline/actions/transform_decode.py +102 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
- cloe_nessy/pipeline/actions/transform_filter.py +51 -0
- cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
- cloe_nessy/pipeline/actions/transform_join.py +81 -0
- cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
- cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
- cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
- cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
- cloe_nessy/pipeline/actions/transform_union.py +71 -0
- cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
- cloe_nessy/pipeline/pipeline.py +201 -0
- cloe_nessy/pipeline/pipeline_action.py +62 -0
- cloe_nessy/pipeline/pipeline_config.py +92 -0
- cloe_nessy/pipeline/pipeline_context.py +56 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
- cloe_nessy/pipeline/pipeline_step.py +50 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +3 -0
- cloe_nessy/session/session_manager.py +188 -0
- cloe_nessy/settings/__init__.py +3 -0
- cloe_nessy/settings/settings.py +91 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +19 -0
- cloe_nessy-0.2.9.dist-info/METADATA +26 -0
- cloe_nessy-0.2.9.dist-info/RECORD +78 -0
- cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
- cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
cloe_nessy/file_utilities/strategies/base_strategy.py

@@ -0,0 +1,59 @@
from abc import ABC, abstractmethod


class FileRetrievalStrategy(ABC):
    """Abstract base class for file retrieval strategies.

    This class defines the interface for strategies that retrieve file paths
    based on certain criteria. Concrete implementations of this class should
    provide the logic for retrieving file paths.
    """

    @staticmethod
    @abstractmethod
    def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list[str]:
        """Retrieves a list of file paths based on the specified criteria.

        Args:
            location: The location to search for files.
            extension: The file extension to filter by. If None, no extension filtering is applied.
                If an empty string, it matches files with no extension.
            search_subdirs: Whether to search in subdirectories.

        Returns:
            list[str]: A list of file paths that match the specified criteria.
        """
        pass

    @staticmethod
    def _matches_extension(file_name: str, extension: str | None) -> bool:
        """Determines if a file name ends with the specified extension.

        This method checks whether the provided file name matches the given file extension. The comparison is case-insensitive.

        If the `extension` is an empty string, it checks if the file name either does not contain a dot or ends with a dot,
        which indicates a file with no extension. If the `extension` is `None`, it matches any file name regardless of extension.

        If the `extension` contains a dot (e.g., ".txt"), it is compared directly against the end of the file name. Otherwise,
        a dot is prefixed to the `extension` to create the expected file extension format (e.g., "txt" becomes ".txt").

        Args:
            file_name: The name of the file to check. This is converted to lowercase for case-insensitive comparison.
            extension: The extension to match against. Can be a string with or without a leading dot.

        Returns:
            bool: True if the file name ends with the specified extension, False otherwise.
        """
        file_name_lower = file_name.lower()
        matches = False

        if extension == "":
            matches = "." not in file_name_lower or file_name.endswith(".")
        elif extension is None:
            matches = True
        elif "." in extension:
            matches = file_name_lower.endswith(extension.lower())
        else:
            matches = file_name_lower.endswith(f".{extension.lower()}")

        return matches
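For orientation, the sketch below shows how a concrete strategy could plug into this interface. The `GlobStrategy` name is hypothetical and not part of the package; it only illustrates implementing `get_file_paths` and reusing the shared `_matches_extension` helper.

from pathlib import Path

from cloe_nessy.file_utilities.strategies.base_strategy import FileRetrievalStrategy


class GlobStrategy(FileRetrievalStrategy):
    """Hypothetical example strategy based on pathlib globbing (not part of cloe-nessy)."""

    @staticmethod
    def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list[str]:
        # Walk recursively only when subdirectory search is requested.
        pattern = "**/*" if search_subdirs else "*"
        candidates = (p for p in Path(location).glob(pattern) if p.is_file())
        # Reuse the base-class helper so extension handling stays consistent with the other strategies.
        return [str(p) for p in candidates if FileRetrievalStrategy._matches_extension(p.name, extension)]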
cloe_nessy/file_utilities/strategies/local_strategy.py

@@ -0,0 +1,51 @@
import os

from ..exceptions import FileUtilitiesError
from .base_strategy import FileRetrievalStrategy


class LocalDirectoryStrategy(FileRetrievalStrategy):
    """Strategy for retrieving files from a local directory.

    This strategy implements the file retrieval logic for local directories, including
    optional recursive search through subdirectories and filtering by file extension.
    """

    @staticmethod
    def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list[str]:
        """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.

        Args:
            location: Top-level directory to read from, e.g., '/Volumes/my_volume/landing/example_landing/'.
            extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
                extension, input None to get all files.
            search_subdirs: If True, function will also search within all subdirectories.

        Returns:
            List: List of files in the directory and its subdirectories with the given extension.

        Raises:
            ValueError: If the location is not provided.
            FileUtilitiesError: For any other unexpected errors.
        """
        if not location:
            raise ValueError("location is required")

        if not os.path.isdir(location):
            raise FileUtilitiesError(f"The provided path '{location}' is not a valid directory.")

        file_list = []

        try:
            for root, _, files in os.walk(location):
                if not search_subdirs and root != location:
                    continue

                for file_name in files:
                    if FileRetrievalStrategy._matches_extension(file_name, extension):
                        file_list.append(os.path.join(root, file_name))

        except Exception as err:
            raise FileUtilitiesError(f"An error occurred while retrieving file paths: {err}") from err

        return file_list
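A minimal usage sketch for the strategy above (editorial illustration, not part of the package); the import path follows the file layout listed at the top, and the directory is the one from the docstring example.

from cloe_nessy.file_utilities.strategies.local_strategy import LocalDirectoryStrategy

# All CSV files under the landing directory, including subdirectories.
csv_files = LocalDirectoryStrategy.get_file_paths(
    "/Volumes/my_volume/landing/example_landing/", extension="csv"
)

# Only files without an extension, limited to the top-level directory.
bare_files = LocalDirectoryStrategy.get_file_paths(
    "/Volumes/my_volume/landing/example_landing/", extension="", search_subdirs=False
)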
cloe_nessy/file_utilities/strategies/onelake_strategy.py

@@ -0,0 +1,31 @@
from .base_strategy import FileRetrievalStrategy
from .local_strategy import LocalDirectoryStrategy


class OneLakeStrategy(FileRetrievalStrategy):
    """Strategy for retrieving files from the OneLake."""

    @staticmethod
    def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list:
        """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.

        Args:
            location: Top-level directory to read from, e.g., '/Volumes/my_volume/landing/example_landing/'.
            extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
                extension, input None to get all files.
            search_subdirs: If True, function will also search within all subdirectories.

        Returns:
            List: List of files in the directory and its subdirectories with the given extension.

        Raises:
            ValueError: If the location is not provided.
            Exception: For any other unexpected errors.
        """
        if not location:
            raise ValueError("location is required")

        file_paths = LocalDirectoryStrategy.get_file_paths(location, extension, search_subdirs)

        shortened_file_paths = [p.replace("/lakehouse/default/", "") for p in file_paths]
        return shortened_file_paths
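A short usage sketch (editorial, not part of the package), assuming the standard Fabric setup where the OneLake lakehouse is mounted under /lakehouse/default/:

from cloe_nessy.file_utilities.strategies.onelake_strategy import OneLakeStrategy

# A file found at /lakehouse/default/Files/landing/data.json is returned as
# "Files/landing/data.json", since the mount prefix is stripped from each path.
paths = OneLakeStrategy.get_file_paths("/lakehouse/default/Files/landing/", extension="json")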
cloe_nessy/file_utilities/strategies/utils_strategy.py

@@ -0,0 +1,72 @@
from ...session import SessionManager
from ..exceptions import FileUtilitiesError
from .base_strategy import FileRetrievalStrategy


class UtilsStrategy(FileRetrievalStrategy):
    """Strategy for retrieving files using DButils (in Databricks) and mssparkutils (in Fabric).

    This strategy implements the file retrieval logic using utils, including
    recursive search through directories and filtering by file extension.
    """

    @staticmethod
    def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list:
        """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.

        Args:
            location: Top-level directory to read from, e.g., '/Volumes/my_volume/landing/example_landing/'.
            extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
                extension, input None to get all files.
            search_subdirs: If True, function will also search within all subdirectories.

        Returns:
            List: List of files in the directory and its subdirectories with the given extension.

        Raises:
            ValueError: If the location is not provided.
            Exception: For any other unexpected errors.
        """
        if not location:
            raise ValueError("location is required")

        utils = SessionManager.get_utils()

        def _inner_loop(directory: str) -> list:
            """Inner loop that recursively traverses directories to find all files with a given extension.

            Args:
                directory: The directory to start searching in.

            Returns:
                List: List of all files in the directory and its subdirectories with the given extension.
            """
            try:
                dirs = utils.fs.ls(directory)
            except Exception as err:
                raise FileUtilitiesError(
                    f"An error occurred while listing files in directory '{directory}': {err}"
                ) from err

            file_list = [file for file in dirs if FileRetrievalStrategy._matches_extension(file.name, extension)]

            if search_subdirs:
                for p in dirs:
                    if p.isDir() and p.path != directory:
                        try:
                            sub_dir_files = _inner_loop(p.path)
                            file_list.extend(sub_dir_files)
                        except Exception as err:
                            raise FileUtilitiesError(
                                f"An error occurred while processing subdirectory '{p.path}': {err}"
                            ) from err

            return file_list

        try:
            file_list = _inner_loop(location)
        except Exception as err:
            raise FileUtilitiesError(f"An error occurred while retrieving file paths: {err}") from err

        file_list = [p.path for p in file_list if not p.isDir()]
        return file_list
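A usage sketch for the strategy above (editorial, not part of the package). It assumes a Databricks or Fabric runtime where `SessionManager.get_utils()` can resolve dbutils/mssparkutils; the storage URI is made up for illustration.

from cloe_nessy.file_utilities.strategies.utils_strategy import UtilsStrategy

# Works only on a Databricks/Fabric cluster; the abfss URI below is a hypothetical example.
parquet_files = UtilsStrategy.get_file_paths(
    "abfss://landing@examplestorage.dfs.core.windows.net/example_landing/",
    extension="parquet",
)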
File without changes
cloe_nessy/integration/reader/api_reader.py

@@ -0,0 +1,141 @@
import json
from typing import Any

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from requests.auth import AuthBase

from cloe_nessy.clients.api_client.api_response import APIResponse

from ...clients.api_client import APIClient
from ...clients.api_client.exceptions import (
    APIClientConnectionError,
    APIClientError,
    APIClientHTTPError,
    APIClientTimeoutError,
)
from .reader import BaseReader


class APIReader(BaseReader):
    """Utility class for reading an API into a DataFrame.

    This class uses an APIClient to fetch data from an API and load it into a Spark DataFrame.

    Attributes:
        api_client: The client for making API requests.
    """

    def __init__(self, base_url: str, auth: AuthBase | None, default_headers: dict[str, str] | None = None):
        """Initializes the APIReader object.

        Args:
            base_url: The base URL for the API.
            auth: The authentication method for the API.
            default_headers: Default headers to include in requests.
        """
        super().__init__()
        self.api_client = APIClient(base_url, auth, default_headers)

    def read(
        self,
        endpoint: str = "",
        method: str = "GET",
        key: str | None = None,
        timeout: int = 30,
        params: dict[str, str] | None = None,
        headers: dict[str, str] | None = None,
        data: dict[str, str] | None = None,
        json_body: dict[str, str] | None = None,
        max_retries: int = 0,
        options: dict[str, str] | None = None,
        add_metadata_column: bool = False,
        **kwargs: Any,
    ) -> DataFrame:
        """Reads data from an API endpoint and returns it as a DataFrame.

        Args:
            endpoint: The endpoint to send the request to.
            method: The HTTP method to use for the request.
            key: The key to extract from the JSON response.
            timeout: The timeout for the request in seconds.
            params: The query parameters for the request.
            headers: The headers to include in the request.
            data: The form data to include in the request.
            json_body: The JSON data to include in the request.
            max_retries: The maximum number of retries for the request.
            options: Additional options for the createDataFrame function.
            add_metadata_column: If set, adds a __metadata column containing metadata about the API response.
            kwargs: This method does not accept any additional keyword arguments.

        Returns:
            DataFrame: The Spark DataFrame containing the read data in the json_object column.

        Raises:
            RuntimeError: If there is an error with the API request or reading the data.
        """
        options = options or {}
        try:
            response = self.api_client.request(
                method=method,
                endpoint=endpoint,
                timeout=timeout,
                params=params,
                headers=headers,
                data=data,
                json=json_body,
                max_retries=max_retries,
            )
            data_list = response.to_dict(key)
            json_string = json.dumps(data_list)
            df: DataFrame = self._spark.createDataFrame(data={json_string}, schema=["json_string"], **options)  # type: ignore
            row = df.select("json_string").head()
            if row is not None:
                schema = F.schema_of_json(row[0])
            else:
                raise RuntimeError("It was not possible to infer the schema of the JSON data.")
            df_result = df.withColumn("json_object", F.from_json("json_string", schema)).select("json_object")
            if add_metadata_column:
                df_result = self._add_metadata_column(df_result, response)
            return df_result

        except (APIClientHTTPError, APIClientConnectionError, APIClientTimeoutError) as e:
            raise RuntimeError(f"API request failed: {e}") from e
        except APIClientError as e:
            raise RuntimeError(f"An error occurred while reading the API data: {e}") from e
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred: {e}") from e

    def _add_metadata_column(self, df: DataFrame, response: APIResponse):
        """Adds a metadata column to a DataFrame.

        This method appends a column named `__metadata` to the given DataFrame, containing a map
        of metadata related to an API response. The metadata includes the current timestamp,
        the base URL of the API, the URL of the request, the HTTP status code, the reason phrase,
        and the elapsed time of the request in seconds.

        Args:
            df: The DataFrame to which the metadata column will be added.
            response: The API response object containing the metadata to be added.

        Returns:
            DataFrame: The original DataFrame with an added `__metadata` column containing the API response metadata.
        """
        df = df.withColumn(
            "__metadata",
            F.create_map(
                F.lit("timestamp"),
                F.current_timestamp(),
                F.lit("base_url"),
                F.lit(self.api_client.base_url),
                F.lit("url"),
                F.lit(response.url),
                F.lit("status_code"),
                F.lit(response.status_code),
                F.lit("reason"),
                F.lit(response.reason),
                F.lit("elapsed"),
                F.lit(response.elapsed),
            ),
        )
        return df
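A usage sketch for the reader above (editorial, not part of the package). It assumes a running Spark session, since `BaseReader` provides `self._spark`; the API URL, endpoint, and credentials are made up, and any `requests.auth.AuthBase` implementation (or `None`) can be passed.

from requests.auth import HTTPBasicAuth

from cloe_nessy.integration.reader.api_reader import APIReader

reader = APIReader(
    base_url="https://api.example.com/v1",            # hypothetical API
    auth=HTTPBasicAuth("user", "secret"),             # or None / another AuthBase
    default_headers={"Accept": "application/json"},
)

# The parsed payload lands in the json_object column; key selects a sub-object of the
# JSON response, and add_metadata_column appends request metadata as __metadata.
df = reader.read(endpoint="orders", key="items", max_retries=3, add_metadata_column=True)
df.printSchema()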
cloe_nessy/integration/reader/catalog_reader.py

@@ -0,0 +1,49 @@
from typing import Any

from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException

from .exceptions import ReadOperationFailedError
from .reader import BaseReader


class CatalogReader(BaseReader):
    """A reader for Unity Catalog objects.

    This class reads data from a Unity Catalog table and loads it into a Spark DataFrame.
    """

    def __init__(self):
        """Initializes the CatalogReader object."""
        super().__init__()

    def read(self, table_identifier: str = "", **kwargs: Any) -> DataFrame:
        """Reads a table from the Unity Catalog.

        Args:
            table_identifier: The table identifier in the Unity Catalog in the format 'catalog.schema.table'.
            **kwargs: This method does not accept any additional keyword arguments.

        Returns:
            The Spark DataFrame containing the read data.

        Raises:
            ValueError: If the table_identifier is not provided, is not a string, or is not in the correct format.
            Exception: For any other unexpected errors.
        """
        if not table_identifier:
            raise ValueError("table_identifier is required")
        if not isinstance(table_identifier, str):
            raise ValueError("table_identifier must be a string")
        if len(table_identifier.split(".")) != 3:
            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")

        try:
            df = self._spark.read.table(table_identifier)
            return df
        except AnalysisException as err:
            raise ValueError(f"Table not found: {table_identifier}") from err
        except Exception as err:
            raise ReadOperationFailedError(
                f"An error occurred while reading the table '{table_identifier}': {err}"
            ) from err
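A usage sketch (editorial, not part of the package), assuming a Spark session with Unity Catalog access; the table identifier is illustrative.

from cloe_nessy.integration.reader.catalog_reader import CatalogReader

reader = CatalogReader()
# The identifier must have exactly three dot-separated parts: catalog.schema.table.
df = reader.read(table_identifier="my_catalog.sales.orders")
df.show(5)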
cloe_nessy/integration/reader/excel_reader.py

@@ -0,0 +1,170 @@
from collections.abc import Callable
from typing import Any

import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import DataFrame

from .reader import BaseReader


class ExcelDataFrameReader(BaseReader):
    """Utility class for reading an Excel file into a DataFrame.

    This class uses the Pandas API on Spark to read Excel files to a DataFrame.
    More information can be found in the [official
    documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html).
    """

    def __init__(self):
        """Initializes the ExcelDataFrameReader object."""
        super().__init__()

    def read_stream(self) -> DataFrame:
        """Currently not implemented."""
        raise NotImplementedError("Currently not implemented.")

    def read(
        self,
        location: str,
        *,
        sheet_name: str | int | list = 0,
        header: int | list[int] = 0,
        index_col: int | list[int] | None = None,
        usecols: int | str | list | Callable | None = None,
        true_values: list | None = None,
        false_values: list | None = None,
        nrows: int | None = None,
        na_values: list[str] | dict[str, list[str]] | None = None,
        keep_default_na: bool = True,
        parse_dates: bool | list | dict = False,
        date_parser: Callable | None = None,
        thousands: str | None = None,
        options: dict | None = None,
        load_as_strings: bool = False,
        add_metadata_column: bool = False,
        **kwargs: Any,
    ) -> DataFrame:
        """Reads Excel file on specified location and returns DataFrame.

        Args:
            location: Location of files to read.
            sheet_name: Strings are used for sheet names.
                Integers are used in zero-indexed sheet positions. Lists of
                strings/integers are used to request multiple sheets. Specify None
                to get all sheets.
            header: Row to use for column labels. If a
                list of integers is passed those row positions will be combined. Use
                None if there is no header.
            index_col: Column to use as the row labels of the
                DataFrame. Pass None if there is no such column. If a list is
                passed, those columns will be combined.
            usecols: Return a subset of the columns. If
                None, then parse all columns. If str, then indicates comma separated
                list of Excel column letters and column ranges (e.g. “A:E” or
                “A,C,E:F”). Ranges are inclusive of both sides. If list of int,
                then indicates list of column numbers to be parsed. If list of
                string, then indicates list of column names to be parsed. If
                Callable, then evaluate each column name against it and parse the
                column if the Callable returns True.
            true_values: Values to consider as True.
            false_values: Values to consider as False.
            nrows: Number of rows to parse.
            na_values: Additional strings to recognize as
                NA/NaN. If dict passed, specific per-column NA values.
            keep_default_na: If na_values are specified and
                keep_default_na is False the default NaN values are overridden,
                otherwise they're appended to.
            parse_dates: The behavior is as follows:
                - bool. If True -> try parsing the index.
                - list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.
                - list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column.
                - dict, e.g. {{"foo" : [1, 3]}} -> parse columns 1, 3 as date and call result "foo"
                If a column or index contains an unparseable date, the entire column or index will be returned unaltered as an object data type.
            date_parser: Function to use for converting a sequence of
                string columns to an array of datetime instances. The default uses
                dateutil.parser.parser to do the conversion.
            thousands: Thousands separator for parsing string columns to
                numeric. Note that this parameter is only necessary for columns
                stored as TEXT in Excel, any numeric columns will automatically be
                parsed, regardless of display format.
            options: Optional keyword arguments passed to
                pyspark.pandas.read_excel and handed to TextFileReader.
            load_as_strings: If True, converts all columns to string type to avoid datatype conversion errors in Spark.
            add_metadata_column: If True, adds a metadata column containing the file location and sheet name.
            kwargs: This method does not accept any additional keyword arguments.
        """
        if options is None:
            options = {}
        if ".xls" not in location:
            raise ValueError(
                "The excel reader can only be used for files with extension .xls. Use FileReader or some other reader instead."
            )
        try:
            df = pd.read_excel(  # type: ignore
                location,
                sheet_name=sheet_name,
                header=header,
                index_col=index_col,
                usecols=usecols,
                true_values=true_values,
                false_values=false_values,
                nrows=nrows,
                na_values=na_values,
                keep_default_na=keep_default_na,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                dtype="string" if load_as_strings else None,
                **options,
            )
            if isinstance(df, dict):
                # in case pandas.read_excel returns a dict, union to single df
                df = pd.concat(list(df.values()), ignore_index=True)

        except FileNotFoundError:
            self._console_logger.error(f"No xls(x) file was found at the specified location [ '{location}' ].")
            raise
        except Exception as e:
            self._console_logger.error(f"read file [ '{location}' ] failed. Error: {e}")
        else:
            self._console_logger.info(f"Read file [ '{location}' ] succeeded.")

        spark_df = self._spark.createDataFrame(df)
        if add_metadata_column:
            spark_df = self._add_metadata_column(df=spark_df, location=location, sheet_name=sheet_name)
        return spark_df

    def _add_metadata_column(self, df: DataFrame, location: str, sheet_name: str | int | list):
        """Adds a metadata column to a DataFrame.

        This method appends a column named `__metadata` to the given DataFrame, containing a map
        of metadata related to the Excel file read operation. The metadata includes the current
        timestamp, the location of the Excel file, and the sheet name(s) from which the data was read.

        Args:
            df: The DataFrame to which the metadata column will be added.
            location: The file path of the Excel file.
            sheet_name: The sheet name or sheet index used when reading the Excel file.

        Returns:
            DataFrame: The original DataFrame with an added `__metadata` column containing the Excel file metadata.
        """
        # Convert sheet_name to string if it is not already a string
        if isinstance(sheet_name, list):
            sheet_name = ", ".join(map(str, sheet_name))
        else:
            sheet_name = str(sheet_name)

        df = df.withColumn(
            "__metadata",
            F.create_map(
                F.lit("timestamp"),
                F.current_timestamp(),
                F.lit("file_location"),
                F.lit(location),
                F.lit("sheet_name"),
                F.lit(sheet_name),
            ),
        )
        return df
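A usage sketch for the Excel reader above (editorial, not part of the package). The workbook path and sheet name are illustrative; `load_as_strings=True` sidesteps mixed-type columns failing the pandas-to-Spark conversion.

from cloe_nessy.integration.reader.excel_reader import ExcelDataFrameReader

reader = ExcelDataFrameReader()
df = reader.read(
    location="/Volumes/my_volume/landing/example_landing/report.xlsx",  # illustrative path
    sheet_name="Sheet1",
    header=0,
    load_as_strings=True,
    add_metadata_column=True,
)
df.show(5)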