cloe-nessy 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,12 @@ from .factory import FileRetrievalFactory
5
5
  from .location_types import LocationType
6
6
 
7
7
 
8
- def get_file_paths(location: str, file_name_pattern: str | None = None, search_subdirs: bool = True) -> list[str]:
8
+ def get_file_paths(
9
+ location: str,
10
+ file_name_pattern: str | None = None,
11
+ search_subdirs: bool = True,
12
+ **kwargs,
13
+ ) -> list[str]:
9
14
  """Retrieves file paths from a specified location based on the provided criteria.
10
15
 
11
16
  This function determines the type of location (e.g., local directory, blob storage),
@@ -16,6 +21,7 @@ def get_file_paths(location: str, file_name_pattern: str | None = None, search_s
16
21
  location: The location to search for files. This could be a path to a local directory or a URI for blob storage.
17
22
  file_name_pattern: The file file_name_pattern to filter by as string. None retrieves all files regardless of file_name_pattern.
18
23
  search_subdirs: Whether to include files from subdirectories in the search.
24
+ kwargs: Additional keyword arguments.
19
25
 
20
26
  Returns:
21
27
  A list of file paths that match the specified criteria. The paths are returned as strings.
@@ -37,7 +43,7 @@ def get_file_paths(location: str, file_name_pattern: str | None = None, search_s
37
43
  logger.info(
38
44
  f"Retrieving file paths from location [ '{location}' ] with strategy [ '{strategy.__class__.__name__}' ]"
39
45
  )
40
- paths = strategy.get_file_paths(location, file_name_pattern, search_subdirs)
46
+ paths = strategy.get_file_paths(location, file_name_pattern, search_subdirs, **kwargs)
41
47
  logger.debug("paths:", paths)
42
48
  return paths
43
49
 
@@ -11,7 +11,12 @@ class FileRetrievalStrategy(ABC):
11
11
 
12
12
  @staticmethod
13
13
  @abstractmethod
14
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list[str]:
14
+ def get_file_paths(
15
+ location: str,
16
+ extension: str | None = None,
17
+ search_subdirs: bool = True,
18
+ **kwargs,
19
+ ) -> list[str]:
15
20
  """Retrieves a list of file paths based on the specified criteria.
16
21
 
17
22
  Args:
@@ -19,11 +24,12 @@ class FileRetrievalStrategy(ABC):
19
24
  extension: The file extension to filter by. If None, no extension filtering is applied.
20
25
  If an empty string, it matches files with no extension.
21
26
  search_subdirs: Whether to search in subdirectories.
27
+ kwargs: Additional keyword arguments that may be used by concrete implementations
22
28
 
23
29
  Returns:
24
30
  list[str]: A list of file paths that match the specified criteria.
25
31
  """
26
- pass
32
+ raise NotImplementedError("Concrete implementations must provide the logic for retrieving file paths.")
27
33
 
28
34
  @staticmethod
29
35
  def _matches_extension(file_name: str, extension: str | None) -> bool:
@@ -12,7 +12,12 @@ class LocalDirectoryStrategy(FileRetrievalStrategy):
12
12
  """
13
13
 
14
14
  @staticmethod
15
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list[str]:
15
+ def get_file_paths(
16
+ location: str,
17
+ extension: str | None = None,
18
+ search_subdirs: bool = True,
19
+ **kwargs, # noqa: ARG004
20
+ ) -> list[str]:
16
21
  """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
17
22
 
18
23
  Args:
@@ -20,6 +25,7 @@ class LocalDirectoryStrategy(FileRetrievalStrategy):
20
25
  extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
21
26
  extension, input None to get all files.
22
27
  search_subdirs: If True, function will also search within all subdirectories.
28
+ kwargs: Additional keyword arguments. Used in the OneLakeStrategy.
23
29
 
24
30
  Returns:
25
31
  List: List of files in the directory and its subdirectories with the given extension.
@@ -6,7 +6,12 @@ class OneLakeStrategy(FileRetrievalStrategy):
6
6
  """Strategy for retrieving files from the OneLake."""
7
7
 
8
8
  @staticmethod
9
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list:
9
+ def get_file_paths(
10
+ location: str,
11
+ extension: str | None = None,
12
+ search_subdirs: bool = True,
13
+ **kwargs,
14
+ ) -> list:
10
15
  """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
11
16
 
12
17
  Args:
@@ -14,6 +19,7 @@ class OneLakeStrategy(FileRetrievalStrategy):
14
19
  extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
15
20
  extension, input None to get all files.
16
21
  search_subdirs: If True, function will also search within all subdirectories.
22
+ kwargs: Additional keyword arguments.
17
23
 
18
24
  Returns:
19
25
  List: List of files in the directory and its subdirectories with the given extension.
@@ -27,5 +33,18 @@ class OneLakeStrategy(FileRetrievalStrategy):
27
33
 
28
34
  file_paths = LocalDirectoryStrategy.get_file_paths(location, extension, search_subdirs)
29
35
 
30
- shortened_file_paths = [p.replace("/lakehouse/default/", "") for p in file_paths]
31
- return shortened_file_paths
36
+ if kwargs.get("onelake_relative_paths", False) is True:
37
+ file_paths = OneLakeStrategy._relative_file_paths(file_paths)
38
+
39
+ return file_paths
40
+
41
+ @staticmethod
42
+ def _relative_file_paths(file_paths: list[str]) -> list[str]:
43
+ """OneLake expects relative paths when working with spark.
44
+
45
+ Note:
46
+ Long Paths (in the format '/lakehouse/default/Files/my_file') are
47
+ used, e.g., when working with Pandas or os.
48
+ """
49
+ relative_file_paths = [p.replace("/lakehouse/default/", "") for p in file_paths]
50
+ return relative_file_paths
@@ -11,7 +11,12 @@ class UtilsStrategy(FileRetrievalStrategy):
11
11
  """
12
12
 
13
13
  @staticmethod
14
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list:
14
+ def get_file_paths(
15
+ location: str,
16
+ extension: str | None = None,
17
+ search_subdirs: bool = True,
18
+ **kwargs, # noqa: ARG004
19
+ ) -> list:
15
20
  """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
16
21
 
17
22
  Args:
@@ -19,6 +24,7 @@ class UtilsStrategy(FileRetrievalStrategy):
19
24
  extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
20
25
  extension, input None to get all files.
21
26
  search_subdirs: If True, function will also search within all subdirectories.
27
+ kwargs: Additional keyword arguments. Used in the OneLakeStrategy.
22
28
 
23
29
  Returns:
24
30
  List: List of files in the directory and its subdirectories with the given extension.
@@ -54,7 +54,7 @@ class FileReader(BaseReader):
54
54
  spark_format = extension_to_datatype_dict[extension]
55
55
  self._console_logger.debug(f"Reading files with format: {spark_format}")
56
56
  if extension:
57
- file_paths = get_file_paths(location, extension, search_subdirs)
57
+ file_paths = get_file_paths(location, extension, search_subdirs, onelake_relative_paths=True)
58
58
  else:
59
59
  file_paths = [location]
60
60
  self._console_logger.debug(f"Found {len(file_paths)} files to read")
@@ -1,4 +1,3 @@
1
- import os
2
1
  from enum import Enum
3
2
  from typing import Any
4
3
 
@@ -62,7 +61,9 @@ class SessionManager:
62
61
  return cls._spark
63
62
 
64
63
  @classmethod
65
- def get_utils(cls) -> Any: # return type should be Union[DBUtils, MsSparkUtils], but can't import locally.
64
+ def get_utils(
65
+ cls,
66
+ ) -> Any: # return type should be Union[DBUtils, MsSparkUtils], but can't import locally.
66
67
  """Get or create a DBUtils or MsSparkUtils instance, depending on the context.
67
68
 
68
69
  In Databricks this will return DBUtils, while in Fabric it will return MsSparkUtils.
@@ -167,22 +168,25 @@ class SessionManager:
167
168
  @classmethod
168
169
  def get_spark_builder(cls):
169
170
  """Get the SparkSession builder based on the current environment."""
171
+ cls._detect_env()
170
172
  builders = {
171
173
  cls.Environment.DATABRICKS_UI: SparkSession.builder,
172
174
  cls.Environment.FABRIC_UI: SparkSession.builder,
173
- cls.Environment.DATABRICKS_CONNECT: cls._get_databricks_connect_builder(),
174
- cls.Environment.OTHER_REMOTE_SPARK: cls._get_databricks_connect_builder(),
175
+ cls.Environment.DATABRICKS_CONNECT: cls._get_databricks_connect_builder,
176
+ cls.Environment.OTHER_REMOTE_SPARK: cls._get_databricks_connect_builder,
175
177
  }
176
-
177
178
  builder = builders.get(cls._env)
178
179
  if builder is None:
179
180
  raise ValueError(f"Unsupported environment: {cls._env}")
180
181
 
181
- return builder
182
+ match cls._env:
183
+ case cls.Environment.DATABRICKS_CONNECT | cls.Environment.OTHER_REMOTE_SPARK:
184
+ return builder()
185
+ case _:
186
+ return builder
182
187
 
183
188
  @staticmethod
184
189
  def _get_databricks_connect_builder():
185
190
  from databricks.connect import DatabricksSession
186
191
 
187
- selected_profile_name = os.environ.get("NESSY_DATABRICKSPROFILE") or "DEFAULT"
188
- return DatabricksSession.builder.profile(selected_profile_name)
192
+ return DatabricksSession.builder
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: cloe-nessy
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: Your friendly datalake monster.
5
5
  Home-page: https://initions.com/
6
6
  Author: initions
@@ -9,20 +9,20 @@ cloe_nessy/clients/api_client/exceptions.py,sha256=VR9nYMHWzIRLlMZMrPpOsEX0X_P0j
9
9
  cloe_nessy/file_utilities/__init__.py,sha256=nY8H48jYHvTy0VYSRHVhZaFMlzfch4-T7y3N73tgMpI,73
10
10
  cloe_nessy/file_utilities/exceptions.py,sha256=RDeV2S6AQnFhFINRo84HDV_hk2RMrf5oNQ7GhHmAZy0,97
11
11
  cloe_nessy/file_utilities/factory.py,sha256=JONYGI8MCkNwG2_ujvjN3iB7BIdl7SqXKgV05YY_i4E,1735
12
- cloe_nessy/file_utilities/get_file_paths.py,sha256=RoIOaBcHCMPiVHVEDm5ijnhLxFABZNVZK-nXooh2c7A,2895
12
+ cloe_nessy/file_utilities/get_file_paths.py,sha256=wQCNBi7kgM32BSFlCuKFnORd9myjZUygpNm2-tF1F54,2980
13
13
  cloe_nessy/file_utilities/location_types.py,sha256=G0FjpEu4_inmWbu5tvs2FyZv2TIhmPgjWU_Rtvmd6i8,801
14
14
  cloe_nessy/file_utilities/strategies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=qOcJE7re9-LNlrwtpFoAQIzH_YXx0jHHNKeqKEZzKrs,2591
16
- cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=z_fKAfPcAeKE7SJ8-v8iFwHiWBxyFipfPs7VJhv1FSU,2073
17
- cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=dIAkHPbmybVVxZSlnCPx3CpprahtopV5lxjJXPUKhz8,1405
18
- cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=luFPpjdc-B9-vcLoHun9gguG86o0ERzH9lplmzOpoQE,3050
15
+ cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=2BdGdP8ThjIP4e_fv7apx7Hg_L6q3nsPdek4oPgN7CI,2833
16
+ cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=6OcEjzLvRTBT8FKXhkLI0befT48SHutGHFIXMq5Sq8E,2217
17
+ cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=RnQjWtWIFzFj-zPqzyZaPYIjtjXkgP-K7-VA8GhkNmg,1980
18
+ cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=urayKfOUpSaXKgTs1KVK0TS7FWVrJ3k4OLKh35sCxAU,3194
19
19
  cloe_nessy/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  cloe_nessy/integration/reader/__init__.py,sha256=J5vlORqHLBpHEvzIwfIjzN5xEdOat-8jlmdLcGj8nsA,239
21
21
  cloe_nessy/integration/reader/api_reader.py,sha256=j3Z5O1oH-Zc43TyA_aYtnDNYC9xFMxMqDsRQWtEZGD8,5636
22
22
  cloe_nessy/integration/reader/catalog_reader.py,sha256=tGK-Y0jZQGOrF9eZUzSr7ils-L58uex6qH9PZ81ZLy8,1835
23
23
  cloe_nessy/integration/reader/excel_reader.py,sha256=4kifpIakHpGmap0-P0SUgjJoQdY-eeiZBIDrQp87wK8,8012
24
24
  cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
25
- cloe_nessy/integration/reader/file_reader.py,sha256=CsKjn2W7-w6drbWtD7PNMniJ8kCgxm4qW4knhYA37tg,3850
25
+ cloe_nessy/integration/reader/file_reader.py,sha256=pkrW_N5avqQpqcZuIQgHw5CFf7DFpSuKvq88zPZPfyY,3879
26
26
  cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
27
27
  cloe_nessy/integration/writer/__init__.py,sha256=NIh0t1RYlG3J1Y5_CvnR36N9tISmcElD5Tq06ksmqoA,71
28
28
  cloe_nessy/integration/writer/catalog_writer.py,sha256=49lDvYttUY79Ye_OMN2cji7lGJNNML4TTsjY7VvLVfc,2137
@@ -67,12 +67,12 @@ cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=Kez8puDK7cRfhleBE
67
67
  cloe_nessy/pipeline/actions/transform_union.py,sha256=TDER06IABzxvIez4bGLKCLaDA4eScpTzYRbfUzwv_RQ,2342
68
68
  cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=6yAHTX5kZviumgBW_NYVGAUin6U2nDzmic9of6wA8FY,2590
69
69
  cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
70
- cloe_nessy/session/session_manager.py,sha256=B1TCfpZ8aieN37WWyY2b9qs2U7muyL1edzDCCPeOxHs,6407
70
+ cloe_nessy/session/session_manager.py,sha256=rd33lSafzomuyGf1BzhyjIWuy9sXgFjr-ca7A7Sw8eo,6490
71
71
  cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
72
72
  cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
73
73
  cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
74
  cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
75
- cloe_nessy-0.2.9.dist-info/METADATA,sha256=42HBrdWGyYtb3eOER40KHnW0wVOUK0yQHI8Xi5uEscE,1837
76
- cloe_nessy-0.2.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
77
- cloe_nessy-0.2.9.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
78
- cloe_nessy-0.2.9.dist-info/RECORD,,
75
+ cloe_nessy-0.2.11.dist-info/METADATA,sha256=ggGZFc78AgFrjarO9XcfZAQcqc-mSJKwpwozdk_frlQ,1838
76
+ cloe_nessy-0.2.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
77
+ cloe_nessy-0.2.11.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
78
+ cloe_nessy-0.2.11.dist-info/RECORD,,