cloe-nessy 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/file_utilities/get_file_paths.py +8 -2
- cloe_nessy/file_utilities/strategies/base_strategy.py +8 -2
- cloe_nessy/file_utilities/strategies/local_strategy.py +7 -1
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +22 -3
- cloe_nessy/file_utilities/strategies/utils_strategy.py +7 -1
- cloe_nessy/integration/reader/file_reader.py +1 -1
- cloe_nessy/session/session_manager.py +11 -5
- {cloe_nessy-0.2.9.dist-info → cloe_nessy-0.2.10.dist-info}/METADATA +1 -1
- {cloe_nessy-0.2.9.dist-info → cloe_nessy-0.2.10.dist-info}/RECORD +11 -11
- {cloe_nessy-0.2.9.dist-info → cloe_nessy-0.2.10.dist-info}/WHEEL +0 -0
- {cloe_nessy-0.2.9.dist-info → cloe_nessy-0.2.10.dist-info}/top_level.txt +0 -0
|
@@ -5,7 +5,12 @@ from .factory import FileRetrievalFactory
|
|
|
5
5
|
from .location_types import LocationType
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
def get_file_paths(
|
|
8
|
+
def get_file_paths(
|
|
9
|
+
location: str,
|
|
10
|
+
file_name_pattern: str | None = None,
|
|
11
|
+
search_subdirs: bool = True,
|
|
12
|
+
**kwargs,
|
|
13
|
+
) -> list[str]:
|
|
9
14
|
"""Retrieves file paths from a specified location based on the provided criteria.
|
|
10
15
|
|
|
11
16
|
This function determines the type of location (e.g., local directory, blob storage),
|
|
@@ -16,6 +21,7 @@ def get_file_paths(location: str, file_name_pattern: str | None = None, search_s
|
|
|
16
21
|
location: The location to search for files. This could be a path to a local directory or a URI for blob storage.
|
|
17
22
|
file_name_pattern: The file file_name_pattern to filter by as string. None retrieves all files regardless of file_name_pattern.
|
|
18
23
|
search_subdirs: Whether to include files from subdirectories in the search.
|
|
24
|
+
kwargs: Additional keyword arguments.
|
|
19
25
|
|
|
20
26
|
Returns:
|
|
21
27
|
A list of file paths that match the specified criteria. The paths are returned as strings.
|
|
@@ -37,7 +43,7 @@ def get_file_paths(location: str, file_name_pattern: str | None = None, search_s
|
|
|
37
43
|
logger.info(
|
|
38
44
|
f"Retrieving file paths from location [ '{location}' ] with strategy [ '{strategy.__class__.__name__}' ]"
|
|
39
45
|
)
|
|
40
|
-
paths = strategy.get_file_paths(location, file_name_pattern, search_subdirs)
|
|
46
|
+
paths = strategy.get_file_paths(location, file_name_pattern, search_subdirs, **kwargs)
|
|
41
47
|
logger.debug("paths:", paths)
|
|
42
48
|
return paths
|
|
43
49
|
|
|
@@ -11,7 +11,12 @@ class FileRetrievalStrategy(ABC):
|
|
|
11
11
|
|
|
12
12
|
@staticmethod
|
|
13
13
|
@abstractmethod
|
|
14
|
-
def get_file_paths(
|
|
14
|
+
def get_file_paths(
|
|
15
|
+
location: str,
|
|
16
|
+
extension: str | None = None,
|
|
17
|
+
search_subdirs: bool = True,
|
|
18
|
+
**kwargs,
|
|
19
|
+
) -> list[str]:
|
|
15
20
|
"""Retrieves a list of file paths based on the specified criteria.
|
|
16
21
|
|
|
17
22
|
Args:
|
|
@@ -19,11 +24,12 @@ class FileRetrievalStrategy(ABC):
|
|
|
19
24
|
extension: The file extension to filter by. If None, no extension filtering is applied.
|
|
20
25
|
If an empty string, it matches files with no extension.
|
|
21
26
|
search_subdirs: Whether to search in subdirectories.
|
|
27
|
+
kwargs: Additional keyword arguments that may be used by concrete implementations
|
|
22
28
|
|
|
23
29
|
Returns:
|
|
24
30
|
list[str]: A list of file paths that match the specified criteria.
|
|
25
31
|
"""
|
|
26
|
-
|
|
32
|
+
raise NotImplementedError("Concrete implementations must provide the logic for retrieving file paths.")
|
|
27
33
|
|
|
28
34
|
@staticmethod
|
|
29
35
|
def _matches_extension(file_name: str, extension: str | None) -> bool:
|
|
@@ -12,7 +12,12 @@ class LocalDirectoryStrategy(FileRetrievalStrategy):
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
@staticmethod
|
|
15
|
-
def get_file_paths(
|
|
15
|
+
def get_file_paths(
|
|
16
|
+
location: str,
|
|
17
|
+
extension: str | None = None,
|
|
18
|
+
search_subdirs: bool = True,
|
|
19
|
+
**kwargs, # noqa: ARG004
|
|
20
|
+
) -> list[str]:
|
|
16
21
|
"""Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
|
|
17
22
|
|
|
18
23
|
Args:
|
|
@@ -20,6 +25,7 @@ class LocalDirectoryStrategy(FileRetrievalStrategy):
|
|
|
20
25
|
extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
|
|
21
26
|
extension, input None to get all files.
|
|
22
27
|
search_subdirs: If True, function will also search within all subdirectories.
|
|
28
|
+
kwargs: Additional keyword arguments. Used in the OneLakeStrategy.
|
|
23
29
|
|
|
24
30
|
Returns:
|
|
25
31
|
List: List of files in the directory and its subdirectories with the given extension.
|
|
@@ -6,7 +6,12 @@ class OneLakeStrategy(FileRetrievalStrategy):
|
|
|
6
6
|
"""Strategy for retrieving files from the OneLake."""
|
|
7
7
|
|
|
8
8
|
@staticmethod
|
|
9
|
-
def get_file_paths(
|
|
9
|
+
def get_file_paths(
|
|
10
|
+
location: str,
|
|
11
|
+
extension: str | None = None,
|
|
12
|
+
search_subdirs: bool = True,
|
|
13
|
+
**kwargs,
|
|
14
|
+
) -> list:
|
|
10
15
|
"""Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
|
|
11
16
|
|
|
12
17
|
Args:
|
|
@@ -14,6 +19,7 @@ class OneLakeStrategy(FileRetrievalStrategy):
|
|
|
14
19
|
extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
|
|
15
20
|
extension, input None to get all files.
|
|
16
21
|
search_subdirs: If True, function will also search within all subdirectories.
|
|
22
|
+
kwargs: Additional keyword arguments.
|
|
17
23
|
|
|
18
24
|
Returns:
|
|
19
25
|
List: List of files in the directory and its subdirectories with the given extension.
|
|
@@ -27,5 +33,18 @@ class OneLakeStrategy(FileRetrievalStrategy):
|
|
|
27
33
|
|
|
28
34
|
file_paths = LocalDirectoryStrategy.get_file_paths(location, extension, search_subdirs)
|
|
29
35
|
|
|
30
|
-
|
|
31
|
-
|
|
36
|
+
if kwargs.get("onelake_relative_paths", False) is True:
|
|
37
|
+
file_paths = OneLakeStrategy._relative_file_paths(file_paths)
|
|
38
|
+
|
|
39
|
+
return file_paths
|
|
40
|
+
|
|
41
|
+
@staticmethod
|
|
42
|
+
def _relative_file_paths(file_paths: list[str]) -> list[str]:
|
|
43
|
+
"""OneLake expects relative paths when working with spark.
|
|
44
|
+
|
|
45
|
+
Note:
|
|
46
|
+
Long Paths (in the format '/lakehouse/default/Files/my_file') are
|
|
47
|
+
used, e.g., when working with Pandas or os.
|
|
48
|
+
"""
|
|
49
|
+
relative_file_paths = [p.replace("/lakehouse/default/", "") for p in file_paths]
|
|
50
|
+
return relative_file_paths
|
|
@@ -11,7 +11,12 @@ class UtilsStrategy(FileRetrievalStrategy):
|
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
@staticmethod
|
|
14
|
-
def get_file_paths(
|
|
14
|
+
def get_file_paths(
|
|
15
|
+
location: str,
|
|
16
|
+
extension: str | None = None,
|
|
17
|
+
search_subdirs: bool = True,
|
|
18
|
+
**kwargs, # noqa: ARG004
|
|
19
|
+
) -> list:
|
|
15
20
|
"""Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
|
|
16
21
|
|
|
17
22
|
Args:
|
|
@@ -19,6 +24,7 @@ class UtilsStrategy(FileRetrievalStrategy):
|
|
|
19
24
|
extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
|
|
20
25
|
extension, input None to get all files.
|
|
21
26
|
search_subdirs: If True, function will also search within all subdirectories.
|
|
27
|
+
kwargs: Additional keyword arguments. Used in the OneLakeStrategy.
|
|
22
28
|
|
|
23
29
|
Returns:
|
|
24
30
|
List: List of files in the directory and its subdirectories with the given extension.
|
|
@@ -54,7 +54,7 @@ class FileReader(BaseReader):
|
|
|
54
54
|
spark_format = extension_to_datatype_dict[extension]
|
|
55
55
|
self._console_logger.debug(f"Reading files with format: {spark_format}")
|
|
56
56
|
if extension:
|
|
57
|
-
file_paths = get_file_paths(location, extension, search_subdirs)
|
|
57
|
+
file_paths = get_file_paths(location, extension, search_subdirs, onelake_relative_paths=True)
|
|
58
58
|
else:
|
|
59
59
|
file_paths = [location]
|
|
60
60
|
self._console_logger.debug(f"Found {len(file_paths)} files to read")
|
|
@@ -62,7 +62,9 @@ class SessionManager:
|
|
|
62
62
|
return cls._spark
|
|
63
63
|
|
|
64
64
|
@classmethod
|
|
65
|
-
def get_utils(
|
|
65
|
+
def get_utils(
|
|
66
|
+
cls,
|
|
67
|
+
) -> Any: # return type should be Union[DBUtils, MsSparkUtils], but can't import locally.
|
|
66
68
|
"""Get or create a DBUtils or MsSparkUtils instance, depending on the context.
|
|
67
69
|
|
|
68
70
|
In Databricks this will return DBUtils, while in Fabric it will return MsSparkUtils.
|
|
@@ -167,18 +169,22 @@ class SessionManager:
|
|
|
167
169
|
@classmethod
|
|
168
170
|
def get_spark_builder(cls):
|
|
169
171
|
"""Get the SparkSession builder based on the current environment."""
|
|
172
|
+
cls._detect_env()
|
|
170
173
|
builders = {
|
|
171
174
|
cls.Environment.DATABRICKS_UI: SparkSession.builder,
|
|
172
175
|
cls.Environment.FABRIC_UI: SparkSession.builder,
|
|
173
|
-
cls.Environment.DATABRICKS_CONNECT: cls._get_databricks_connect_builder
|
|
174
|
-
cls.Environment.OTHER_REMOTE_SPARK: cls._get_databricks_connect_builder
|
|
176
|
+
cls.Environment.DATABRICKS_CONNECT: cls._get_databricks_connect_builder,
|
|
177
|
+
cls.Environment.OTHER_REMOTE_SPARK: cls._get_databricks_connect_builder,
|
|
175
178
|
}
|
|
176
|
-
|
|
177
179
|
builder = builders.get(cls._env)
|
|
178
180
|
if builder is None:
|
|
179
181
|
raise ValueError(f"Unsupported environment: {cls._env}")
|
|
180
182
|
|
|
181
|
-
|
|
183
|
+
match cls._env:
|
|
184
|
+
case cls.Environment.DATABRICKS_CONNECT | cls.Environment.OTHER_REMOTE_SPARK:
|
|
185
|
+
return builder()
|
|
186
|
+
case _:
|
|
187
|
+
return builder
|
|
182
188
|
|
|
183
189
|
@staticmethod
|
|
184
190
|
def _get_databricks_connect_builder():
|
|
@@ -9,20 +9,20 @@ cloe_nessy/clients/api_client/exceptions.py,sha256=VR9nYMHWzIRLlMZMrPpOsEX0X_P0j
|
|
|
9
9
|
cloe_nessy/file_utilities/__init__.py,sha256=nY8H48jYHvTy0VYSRHVhZaFMlzfch4-T7y3N73tgMpI,73
|
|
10
10
|
cloe_nessy/file_utilities/exceptions.py,sha256=RDeV2S6AQnFhFINRo84HDV_hk2RMrf5oNQ7GhHmAZy0,97
|
|
11
11
|
cloe_nessy/file_utilities/factory.py,sha256=JONYGI8MCkNwG2_ujvjN3iB7BIdl7SqXKgV05YY_i4E,1735
|
|
12
|
-
cloe_nessy/file_utilities/get_file_paths.py,sha256=
|
|
12
|
+
cloe_nessy/file_utilities/get_file_paths.py,sha256=wQCNBi7kgM32BSFlCuKFnORd9myjZUygpNm2-tF1F54,2980
|
|
13
13
|
cloe_nessy/file_utilities/location_types.py,sha256=G0FjpEu4_inmWbu5tvs2FyZv2TIhmPgjWU_Rtvmd6i8,801
|
|
14
14
|
cloe_nessy/file_utilities/strategies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=
|
|
16
|
-
cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=
|
|
17
|
-
cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=
|
|
18
|
-
cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=
|
|
15
|
+
cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=2BdGdP8ThjIP4e_fv7apx7Hg_L6q3nsPdek4oPgN7CI,2833
|
|
16
|
+
cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=6OcEjzLvRTBT8FKXhkLI0befT48SHutGHFIXMq5Sq8E,2217
|
|
17
|
+
cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=RnQjWtWIFzFj-zPqzyZaPYIjtjXkgP-K7-VA8GhkNmg,1980
|
|
18
|
+
cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=urayKfOUpSaXKgTs1KVK0TS7FWVrJ3k4OLKh35sCxAU,3194
|
|
19
19
|
cloe_nessy/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
20
|
cloe_nessy/integration/reader/__init__.py,sha256=J5vlORqHLBpHEvzIwfIjzN5xEdOat-8jlmdLcGj8nsA,239
|
|
21
21
|
cloe_nessy/integration/reader/api_reader.py,sha256=j3Z5O1oH-Zc43TyA_aYtnDNYC9xFMxMqDsRQWtEZGD8,5636
|
|
22
22
|
cloe_nessy/integration/reader/catalog_reader.py,sha256=tGK-Y0jZQGOrF9eZUzSr7ils-L58uex6qH9PZ81ZLy8,1835
|
|
23
23
|
cloe_nessy/integration/reader/excel_reader.py,sha256=4kifpIakHpGmap0-P0SUgjJoQdY-eeiZBIDrQp87wK8,8012
|
|
24
24
|
cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
|
|
25
|
-
cloe_nessy/integration/reader/file_reader.py,sha256=
|
|
25
|
+
cloe_nessy/integration/reader/file_reader.py,sha256=pkrW_N5avqQpqcZuIQgHw5CFf7DFpSuKvq88zPZPfyY,3879
|
|
26
26
|
cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
|
|
27
27
|
cloe_nessy/integration/writer/__init__.py,sha256=NIh0t1RYlG3J1Y5_CvnR36N9tISmcElD5Tq06ksmqoA,71
|
|
28
28
|
cloe_nessy/integration/writer/catalog_writer.py,sha256=49lDvYttUY79Ye_OMN2cji7lGJNNML4TTsjY7VvLVfc,2137
|
|
@@ -67,12 +67,12 @@ cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=Kez8puDK7cRfhleBE
|
|
|
67
67
|
cloe_nessy/pipeline/actions/transform_union.py,sha256=TDER06IABzxvIez4bGLKCLaDA4eScpTzYRbfUzwv_RQ,2342
|
|
68
68
|
cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=6yAHTX5kZviumgBW_NYVGAUin6U2nDzmic9of6wA8FY,2590
|
|
69
69
|
cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
|
|
70
|
-
cloe_nessy/session/session_manager.py,sha256=
|
|
70
|
+
cloe_nessy/session/session_manager.py,sha256=7LNerwILGkgt752cZLs2nlABGWiaoKdmOuLGWHZ6uYQ,6618
|
|
71
71
|
cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
|
|
72
72
|
cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
|
|
73
73
|
cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
74
|
cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
|
|
75
|
-
cloe_nessy-0.2.
|
|
76
|
-
cloe_nessy-0.2.
|
|
77
|
-
cloe_nessy-0.2.
|
|
78
|
-
cloe_nessy-0.2.
|
|
75
|
+
cloe_nessy-0.2.10.dist-info/METADATA,sha256=W9E01GNme6Zst17uy9TAW_eP7FL_Ng-HkKaUvXf8838,1838
|
|
76
|
+
cloe_nessy-0.2.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
77
|
+
cloe_nessy-0.2.10.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
|
|
78
|
+
cloe_nessy-0.2.10.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|