cloe-nessy 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,12 @@ from .factory import FileRetrievalFactory
5
5
  from .location_types import LocationType
6
6
 
7
7
 
8
- def get_file_paths(location: str, file_name_pattern: str | None = None, search_subdirs: bool = True) -> list[str]:
8
+ def get_file_paths(
9
+ location: str,
10
+ file_name_pattern: str | None = None,
11
+ search_subdirs: bool = True,
12
+ **kwargs,
13
+ ) -> list[str]:
9
14
  """Retrieves file paths from a specified location based on the provided criteria.
10
15
 
11
16
  This function determines the type of location (e.g., local directory, blob storage),
@@ -16,6 +21,7 @@ def get_file_paths(location: str, file_name_pattern: str | None = None, search_s
16
21
  location: The location to search for files. This could be a path to a local directory or a URI for blob storage.
17
22
  file_name_pattern: The file file_name_pattern to filter by as string. None retrieves all files regardless of file_name_pattern.
18
23
  search_subdirs: Whether to include files from subdirectories in the search.
24
+ kwargs: Additional keyword arguments.
19
25
 
20
26
  Returns:
21
27
  A list of file paths that match the specified criteria. The paths are returned as strings.
@@ -37,7 +43,7 @@ def get_file_paths(location: str, file_name_pattern: str | None = None, search_s
37
43
  logger.info(
38
44
  f"Retrieving file paths from location [ '{location}' ] with strategy [ '{strategy.__class__.__name__}' ]"
39
45
  )
40
- paths = strategy.get_file_paths(location, file_name_pattern, search_subdirs)
46
+ paths = strategy.get_file_paths(location, file_name_pattern, search_subdirs, **kwargs)
41
47
  logger.debug("paths:", paths)
42
48
  return paths
43
49
 
@@ -11,7 +11,12 @@ class FileRetrievalStrategy(ABC):
11
11
 
12
12
  @staticmethod
13
13
  @abstractmethod
14
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list[str]:
14
+ def get_file_paths(
15
+ location: str,
16
+ extension: str | None = None,
17
+ search_subdirs: bool = True,
18
+ **kwargs,
19
+ ) -> list[str]:
15
20
  """Retrieves a list of file paths based on the specified criteria.
16
21
 
17
22
  Args:
@@ -19,11 +24,12 @@ class FileRetrievalStrategy(ABC):
19
24
  extension: The file extension to filter by. If None, no extension filtering is applied.
20
25
  If an empty string, it matches files with no extension.
21
26
  search_subdirs: Whether to search in subdirectories.
27
+ kwargs: Additional keyword arguments that may be used by concrete implementations
22
28
 
23
29
  Returns:
24
30
  list[str]: A list of file paths that match the specified criteria.
25
31
  """
26
- pass
32
+ raise NotImplementedError("Concrete implementations must provide the logic for retrieving file paths.")
27
33
 
28
34
  @staticmethod
29
35
  def _matches_extension(file_name: str, extension: str | None) -> bool:
@@ -12,7 +12,12 @@ class LocalDirectoryStrategy(FileRetrievalStrategy):
12
12
  """
13
13
 
14
14
  @staticmethod
15
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list[str]:
15
+ def get_file_paths(
16
+ location: str,
17
+ extension: str | None = None,
18
+ search_subdirs: bool = True,
19
+ **kwargs, # noqa: ARG004
20
+ ) -> list[str]:
16
21
  """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
17
22
 
18
23
  Args:
@@ -20,6 +25,7 @@ class LocalDirectoryStrategy(FileRetrievalStrategy):
20
25
  extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
21
26
  extension, input None to get all files.
22
27
  search_subdirs: If True, function will also search within all subdirectories.
28
+ kwargs: Additional keyword arguments. Used in the OneLakeStrategy.
23
29
 
24
30
  Returns:
25
31
  List: List of files in the directory and its subdirectories with the given extension.
@@ -6,7 +6,12 @@ class OneLakeStrategy(FileRetrievalStrategy):
6
6
  """Strategy for retrieving files from the OneLake."""
7
7
 
8
8
  @staticmethod
9
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list:
9
+ def get_file_paths(
10
+ location: str,
11
+ extension: str | None = None,
12
+ search_subdirs: bool = True,
13
+ **kwargs,
14
+ ) -> list:
10
15
  """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
11
16
 
12
17
  Args:
@@ -14,6 +19,7 @@ class OneLakeStrategy(FileRetrievalStrategy):
14
19
  extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
15
20
  extension, input None to get all files.
16
21
  search_subdirs: If True, function will also search within all subdirectories.
22
+ kwargs: Additional keyword arguments.
17
23
 
18
24
  Returns:
19
25
  List: List of files in the directory and its subdirectories with the given extension.
@@ -27,5 +33,18 @@ class OneLakeStrategy(FileRetrievalStrategy):
27
33
 
28
34
  file_paths = LocalDirectoryStrategy.get_file_paths(location, extension, search_subdirs)
29
35
 
30
- shortened_file_paths = [p.replace("/lakehouse/default/", "") for p in file_paths]
31
- return shortened_file_paths
36
+ if kwargs.get("onelake_relative_paths", False) is True:
37
+ file_paths = OneLakeStrategy._relative_file_paths(file_paths)
38
+
39
+ return file_paths
40
+
41
+ @staticmethod
42
+ def _relative_file_paths(file_paths: list[str]) -> list[str]:
43
+ """OneLake expects relative paths when working with spark.
44
+
45
+ Note:
46
+ Long Paths (in the format '/lakehouse/default/Files/my_file') are
47
+ used, e.g., when working with Pandas or os.
48
+ """
49
+ relative_file_paths = [p.replace("/lakehouse/default/", "") for p in file_paths]
50
+ return relative_file_paths
@@ -11,7 +11,12 @@ class UtilsStrategy(FileRetrievalStrategy):
11
11
  """
12
12
 
13
13
  @staticmethod
14
- def get_file_paths(location: str, extension: str | None = None, search_subdirs: bool = True) -> list:
14
+ def get_file_paths(
15
+ location: str,
16
+ extension: str | None = None,
17
+ search_subdirs: bool = True,
18
+ **kwargs, # noqa: ARG004
19
+ ) -> list:
15
20
  """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
16
21
 
17
22
  Args:
@@ -19,6 +24,7 @@ class UtilsStrategy(FileRetrievalStrategy):
19
24
  extension: File extension, e.g., 'csv', 'json'. Input an empty string to get files without any
20
25
  extension, input None to get all files.
21
26
  search_subdirs: If True, function will also search within all subdirectories.
27
+ kwargs: Additional keyword arguments. Used in the OneLakeStrategy.
22
28
 
23
29
  Returns:
24
30
  List: List of files in the directory and its subdirectories with the given extension.
@@ -54,7 +54,7 @@ class FileReader(BaseReader):
54
54
  spark_format = extension_to_datatype_dict[extension]
55
55
  self._console_logger.debug(f"Reading files with format: {spark_format}")
56
56
  if extension:
57
- file_paths = get_file_paths(location, extension, search_subdirs)
57
+ file_paths = get_file_paths(location, extension, search_subdirs, onelake_relative_paths=True)
58
58
  else:
59
59
  file_paths = [location]
60
60
  self._console_logger.debug(f"Found {len(file_paths)} files to read")
@@ -62,7 +62,9 @@ class SessionManager:
62
62
  return cls._spark
63
63
 
64
64
  @classmethod
65
- def get_utils(cls) -> Any: # return type should be Union[DBUtils, MsSparkUtils], but can't import locally.
65
+ def get_utils(
66
+ cls,
67
+ ) -> Any: # return type should be Union[DBUtils, MsSparkUtils], but can't import locally.
66
68
  """Get or create a DBUtils or MsSparkUtils instance, depending on the context.
67
69
 
68
70
  In Databricks this will return DBUtils, while in Fabric it will return MsSparkUtils.
@@ -167,18 +169,22 @@ class SessionManager:
167
169
  @classmethod
168
170
  def get_spark_builder(cls):
169
171
  """Get the SparkSession builder based on the current environment."""
172
+ cls._detect_env()
170
173
  builders = {
171
174
  cls.Environment.DATABRICKS_UI: SparkSession.builder,
172
175
  cls.Environment.FABRIC_UI: SparkSession.builder,
173
- cls.Environment.DATABRICKS_CONNECT: cls._get_databricks_connect_builder(),
174
- cls.Environment.OTHER_REMOTE_SPARK: cls._get_databricks_connect_builder(),
176
+ cls.Environment.DATABRICKS_CONNECT: cls._get_databricks_connect_builder,
177
+ cls.Environment.OTHER_REMOTE_SPARK: cls._get_databricks_connect_builder,
175
178
  }
176
-
177
179
  builder = builders.get(cls._env)
178
180
  if builder is None:
179
181
  raise ValueError(f"Unsupported environment: {cls._env}")
180
182
 
181
- return builder
183
+ match cls._env:
184
+ case cls.Environment.DATABRICKS_CONNECT | cls.Environment.OTHER_REMOTE_SPARK:
185
+ return builder()
186
+ case _:
187
+ return builder
182
188
 
183
189
  @staticmethod
184
190
  def _get_databricks_connect_builder():
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: cloe-nessy
3
- Version: 0.2.9
3
+ Version: 0.2.10
4
4
  Summary: Your friendly datalake monster.
5
5
  Home-page: https://initions.com/
6
6
  Author: initions
@@ -9,20 +9,20 @@ cloe_nessy/clients/api_client/exceptions.py,sha256=VR9nYMHWzIRLlMZMrPpOsEX0X_P0j
9
9
  cloe_nessy/file_utilities/__init__.py,sha256=nY8H48jYHvTy0VYSRHVhZaFMlzfch4-T7y3N73tgMpI,73
10
10
  cloe_nessy/file_utilities/exceptions.py,sha256=RDeV2S6AQnFhFINRo84HDV_hk2RMrf5oNQ7GhHmAZy0,97
11
11
  cloe_nessy/file_utilities/factory.py,sha256=JONYGI8MCkNwG2_ujvjN3iB7BIdl7SqXKgV05YY_i4E,1735
12
- cloe_nessy/file_utilities/get_file_paths.py,sha256=RoIOaBcHCMPiVHVEDm5ijnhLxFABZNVZK-nXooh2c7A,2895
12
+ cloe_nessy/file_utilities/get_file_paths.py,sha256=wQCNBi7kgM32BSFlCuKFnORd9myjZUygpNm2-tF1F54,2980
13
13
  cloe_nessy/file_utilities/location_types.py,sha256=G0FjpEu4_inmWbu5tvs2FyZv2TIhmPgjWU_Rtvmd6i8,801
14
14
  cloe_nessy/file_utilities/strategies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=qOcJE7re9-LNlrwtpFoAQIzH_YXx0jHHNKeqKEZzKrs,2591
16
- cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=z_fKAfPcAeKE7SJ8-v8iFwHiWBxyFipfPs7VJhv1FSU,2073
17
- cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=dIAkHPbmybVVxZSlnCPx3CpprahtopV5lxjJXPUKhz8,1405
18
- cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=luFPpjdc-B9-vcLoHun9gguG86o0ERzH9lplmzOpoQE,3050
15
+ cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=2BdGdP8ThjIP4e_fv7apx7Hg_L6q3nsPdek4oPgN7CI,2833
16
+ cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=6OcEjzLvRTBT8FKXhkLI0befT48SHutGHFIXMq5Sq8E,2217
17
+ cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=RnQjWtWIFzFj-zPqzyZaPYIjtjXkgP-K7-VA8GhkNmg,1980
18
+ cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=urayKfOUpSaXKgTs1KVK0TS7FWVrJ3k4OLKh35sCxAU,3194
19
19
  cloe_nessy/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  cloe_nessy/integration/reader/__init__.py,sha256=J5vlORqHLBpHEvzIwfIjzN5xEdOat-8jlmdLcGj8nsA,239
21
21
  cloe_nessy/integration/reader/api_reader.py,sha256=j3Z5O1oH-Zc43TyA_aYtnDNYC9xFMxMqDsRQWtEZGD8,5636
22
22
  cloe_nessy/integration/reader/catalog_reader.py,sha256=tGK-Y0jZQGOrF9eZUzSr7ils-L58uex6qH9PZ81ZLy8,1835
23
23
  cloe_nessy/integration/reader/excel_reader.py,sha256=4kifpIakHpGmap0-P0SUgjJoQdY-eeiZBIDrQp87wK8,8012
24
24
  cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
25
- cloe_nessy/integration/reader/file_reader.py,sha256=CsKjn2W7-w6drbWtD7PNMniJ8kCgxm4qW4knhYA37tg,3850
25
+ cloe_nessy/integration/reader/file_reader.py,sha256=pkrW_N5avqQpqcZuIQgHw5CFf7DFpSuKvq88zPZPfyY,3879
26
26
  cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
27
27
  cloe_nessy/integration/writer/__init__.py,sha256=NIh0t1RYlG3J1Y5_CvnR36N9tISmcElD5Tq06ksmqoA,71
28
28
  cloe_nessy/integration/writer/catalog_writer.py,sha256=49lDvYttUY79Ye_OMN2cji7lGJNNML4TTsjY7VvLVfc,2137
@@ -67,12 +67,12 @@ cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=Kez8puDK7cRfhleBE
67
67
  cloe_nessy/pipeline/actions/transform_union.py,sha256=TDER06IABzxvIez4bGLKCLaDA4eScpTzYRbfUzwv_RQ,2342
68
68
  cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=6yAHTX5kZviumgBW_NYVGAUin6U2nDzmic9of6wA8FY,2590
69
69
  cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
70
- cloe_nessy/session/session_manager.py,sha256=B1TCfpZ8aieN37WWyY2b9qs2U7muyL1edzDCCPeOxHs,6407
70
+ cloe_nessy/session/session_manager.py,sha256=7LNerwILGkgt752cZLs2nlABGWiaoKdmOuLGWHZ6uYQ,6618
71
71
  cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
72
72
  cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
73
73
  cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
74
  cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
75
- cloe_nessy-0.2.9.dist-info/METADATA,sha256=42HBrdWGyYtb3eOER40KHnW0wVOUK0yQHI8Xi5uEscE,1837
76
- cloe_nessy-0.2.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
77
- cloe_nessy-0.2.9.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
78
- cloe_nessy-0.2.9.dist-info/RECORD,,
75
+ cloe_nessy-0.2.10.dist-info/METADATA,sha256=W9E01GNme6Zst17uy9TAW_eP7FL_Ng-HkKaUvXf8838,1838
76
+ cloe_nessy-0.2.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
77
+ cloe_nessy-0.2.10.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
78
+ cloe_nessy-0.2.10.dist-info/RECORD,,