sibi-dst 0.3.30__py3-none-any.whl → 0.3.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/df_helper/_df_helper.py

@@ -9,6 +9,7 @@ import dask.dataframe as dd
 from dask import delayed, compute
 import pandas as pd
 from pydantic import BaseModel
+import fsspec

 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
@@ -86,8 +87,11 @@ class DfHelper:
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
+        self.filesystem = kwargs.pop('filesystem', 'file')
+        self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
+        kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)

     def __str__(self):
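The net effect of these two hunks is that DfHelper now accepts filesystem configuration through its keyword arguments and defaults the shared fs handle to a local fsspec filesystem. A minimal sketch of how the new arguments might be passed, assuming DfHelper is exported from sibi_dst.df_helper; the bucket name and credentials below are placeholders, not taken from the package:

    import fsspec

    from sibi_dst.df_helper import DfHelper

    # Placeholder credentials/bucket; any fsspec-supported protocol works the same way.
    s3_options = {"key": "ACCESS_KEY", "secret": "SECRET_KEY"}
    fs = fsspec.filesystem("s3", **s3_options)

    helper = DfHelper(
        backend="parquet",
        parquet_storage_path="s3://example-bucket/warehouse",
        filesystem="s3",                # popped into self.filesystem
        filesystem_options=s3_options,  # popped into self.filesystem_options
        fs=fs,                          # overrides the default fsspec.filesystem('file')
    )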
sibi_dst/df_helper/_parquet_artifact.py

@@ -13,7 +13,7 @@ class ParquetArtifact(DfHelper):
         'backend': 'parquet'
     }

-    def __init__(self, data_wrapper_class, filesystem_type="file", filesystem_options=None, **kwargs):
+    def __init__(self, data_wrapper_class, **kwargs):
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,
@@ -39,13 +39,14 @@ class ParquetArtifact(DfHelper):
            raise ValueError('parquet_end_date must be set')

         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options or {}
-        self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-
+        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
+        self.filesystem_options = self.config.setdefault('filesystem_options', {})
+        self.fs = self.config.setdefault('fs', None)
+        if self.fs is None:
+            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+            self.config.setdefault('fs', self.fs)
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-
         super().__init__(**self.config)

     def load(self, **kwargs):
@@ -97,6 +98,7 @@ class ParquetArtifact(DfHelper):
             'history_days_threshold': kwargs.pop('history_days_threshold', 30),
             'max_age_minutes': kwargs.pop('max_age_minutes', 10),
             'show_progress': kwargs.pop('show_progress', False),
+            'fs': self.fs,
             'filesystem_type': self.filesystem_type,
             'filesystem_options': self.filesystem_options,
         }
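Filesystem settings for ParquetArtifact now travel inside the shared config dictionary (and on to DataWrapper via the 'fs' entry) instead of being dedicated constructor parameters. A rough usage sketch, assuming ParquetArtifact is exported from sibi_dst.df_helper; the wrapper class and paths are hypothetical and other required settings are omitted:

    from sibi_dst.df_helper import ParquetArtifact
    from myproject.wrappers import SalesDataWrapper  # hypothetical DataWrapper subclass

    artifact = ParquetArtifact(
        SalesDataWrapper,
        parquet_storage_path="/tmp/sibi_dst_demo/artifacts",
        parquet_start_date="2024-01-01",
        parquet_end_date="2024-01-31",
        filesystem_type="file",      # picked up via self.config.setdefault(...)
        filesystem_options={},
    )
    df = artifact.load()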
sibi_dst/df_helper/backends/parquet/_parquet_options.py

@@ -13,8 +13,8 @@ from sibi_dst.utils import Logger
 class ParquetConfig(BaseModel):
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
-    parquet_storage_path: Optional[DirectoryPath] = None
-    parquet_full_path: Optional[FilePath] = None
+    parquet_storage_path: Optional[str] = None
+    parquet_full_path: Optional[str] = None
     parquet_folder_list: Optional[List[str]] = None
     parquet_size_bytes: int = 0
     parquet_max_age_minutes: int = 0
@@ -30,14 +30,17 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
-            str(self.parquet_storage_path).split("://")[0])
-
+        #self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+        #    str(self.parquet_storage_path).split("://")[0])
         # Validation for parquet path
+
+
         if self.parquet_storage_path is None:
             raise ValueError('Parquet storage path must be specified')
+        self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
         if not self.fs.exists(self.parquet_storage_path):
-            raise ValueError('Parquet storage path does not exist')
+            self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
+            #raise ValueError('Parquet storage path does not exist')
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(
@@ -57,8 +60,9 @@ class ParquetConfig(BaseModel):
                raise ValueError('Parquet end date must be greater than start date')

            # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-           self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+           self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
                                                         logger=self.logger).generate_file_paths(start_date, end_date)
+
            self.parquet_size_bytes = self.get_parquet_size_bytes()
            self.load_parquet = True
            # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
@@ -84,11 +88,12 @@ class ParquetConfig(BaseModel):
         return total_size

     def load_files(self):
+
         if self.load_parquet:
             if self.parquet_folder_list:
-                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow")
+                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
             else:
-                return dd.read_parquet(self.parquet_full_path, engine="pyarrow")
+                return dd.read_parquet(self.parquet_full_path, engine="pyarrow", filesystem=self.fs)

     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
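Relaxing parquet_storage_path and parquet_full_path to plain strings lets remote URLs such as s3://... pass Pydantic validation, a missing storage directory is now created with fs.mkdirs instead of being rejected, and reads are routed through the shared filesystem handle. The read pattern the new load_files relies on can be exercised on its own; a self-contained local sketch (paths are arbitrary):

    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    fs = fsspec.filesystem("file")

    # Write a tiny partition so the read below has something to load.
    dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=1).to_parquet(
        "/tmp/sibi_dst_demo/2024-01-01", engine="pyarrow"
    )

    # Passing the fsspec instance explicitly mirrors the new ParquetConfig.load_files().
    result = dd.read_parquet(["/tmp/sibi_dst_demo/2024-01-01"], engine="pyarrow", filesystem=fs)
    print(result.compute())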
sibi_dst/utils/data_wrapper.py

@@ -22,6 +22,7 @@ class DataWrapper:
         parquet_filename: str,
         start_date: Any,
         end_date: Any,
+        fs: Optional[fsspec.AbstractFileSystem] = None,
         filesystem_type: str = "file",
         filesystem_options: Optional[Dict] = None,
         verbose: bool = False,
@@ -41,7 +42,7 @@ class DataWrapper:
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fsspec.filesystem(filesystem_type, **self.filesystem_options)
+        self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}
@@ -129,23 +130,45 @@ class DataWrapper:
     def is_file_older_than(self, file_path: str) -> bool:
         """
         Check if a file is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file.
+        :return: True if the file is older than max_age_minutes, False otherwise.
         """
         try:
+            # Get file info
             info = self.fs.info(file_path)
-            file_modification_time = info['mtime']
-            file_modification_datetime = datetime.datetime.fromtimestamp(
-                file_modification_time, tz=datetime.timezone.utc
-            )
+
+            # Determine the modification time from available keys
+            file_modification_time = None
+            if "mtime" in info:  # Local filesystem
+                file_modification_time = info["mtime"]
+                file_modification_datetime = datetime.datetime.fromtimestamp(
+                    file_modification_time, tz=datetime.timezone.utc
+                )
+            elif "LastModified" in info:  # S3-compatible filesystem
+                file_modification_datetime = (
+                    info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
+                    else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
+                )
+            else:
+                self.logger.warning(f"Modification time not available for {file_path}.")
+                return True  # Assume file is too old if we cannot determine its age
+
+            # Compare file age
             current_time = datetime.datetime.now(datetime.timezone.utc)
             file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
             self.logger.info(
                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                 f"(threshold: {self.max_age_minutes} minutes)"
             )
-
             return file_age_minutes > self.max_age_minutes
+
         except FileNotFoundError:
-            return True
+            self.logger.warning(f"File {file_path} not found.")
+            return True  # File is considered old if it doesn't exist
+        except Exception as e:
+            self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
+            return True  #

     def process_date(self, date: datetime.date):
         """Process a specific date by regenerating data as necessary."""
@@ -162,7 +185,7 @@ class DataWrapper:
             self.logger.error("No data found for the specified date.")
             return

-        parquet_saver = ParquetSaver(df, folder, self.logger)
+        parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
         parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)

         end_time = datetime.datetime.now()
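The rewritten is_file_older_than covers both local metadata ("mtime") and S3-style metadata ("LastModified") and degrades to "treat as stale" when neither is present or an error occurs. A condensed standalone sketch of the same detection logic, using only fsspec:

    import datetime

    import fsspec


    def modified_at(fs, path):
        """Best-effort modification time across fsspec backends (sketch)."""
        info = fs.info(path)
        if "mtime" in info:          # local filesystem: POSIX timestamp
            return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
        if "LastModified" in info:   # s3fs and friends: already a datetime
            return info["LastModified"]
        return None                  # caller treats "unknown" as stale


    fs = fsspec.filesystem("file")
    fs.pipe_file("/tmp/sibi_dst_demo.txt", b"hello")
    print(modified_at(fs, "/tmp/sibi_dst_demo.txt"))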
sibi_dst/utils/filepath_generator.py

@@ -91,7 +91,7 @@ class FilePathGenerator:
         if engine == 'dask':
             # Collect individual file paths
             file_pattern = f"{base_dir}/**/*.{self.file_extension}"
-            all_paths = self.fs.glob(file_pattern, recursive=True)
+            all_paths = self.fs.glob(file_pattern)

             if not all_paths and self.debug:
                 self.logger.debug(f"No files found with pattern: {file_pattern}")
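fsspec's AbstractFileSystem.glob does not define a recursive keyword; the "**" wildcard in the pattern is what makes the match recursive, so the extra argument was dropped. For example:

    import fsspec

    fs = fsspec.filesystem("file")
    # '**' already descends into subdirectories; no 'recursive' flag is needed.
    print(fs.glob("/tmp/sibi_dst_demo/**/*.parquet"))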
sibi_dst/utils/parquet_saver.py

@@ -1,25 +1,36 @@
 from pathlib import Path
 from typing import Optional

-import fsspec
 import pyarrow as pa
+import fsspec

 from sibi_dst.utils import Logger


 class ParquetSaver:
-    def __init__(self, df_result, parquet_storage_path, logger=None):
-        # Ensure df_result is a Dask DataFrame
+    def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+        """
+        Initialize ParquetSaver.
+        :param df_result: Dask DataFrame to save.
+        :param parquet_storage_path: Base storage path (e.g., "s3://bucket-name/path/").
+        :param logger: Logger instance for logging messages.
+        :param fs: Pre-initialized fsspec filesystem instance. Defaults to 'file' if None.
+        """
         self.df_result = df_result
-        self.parquet_storage_path = parquet_storage_path
+        self.parquet_storage_path = parquet_storage_path.rstrip("/")
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

+        # Default to the local filesystem if `fs` is not provided
+        self.fs = fs or fsspec.filesystem("file")
+
     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+        """
+        Save the DataFrame to Parquet format.
+        :param parquet_filename: Filename for the Parquet file.
+        :param clear_existing: Whether to clear existing files in the target directory.
+        """
         full_path = self._construct_full_path(parquet_filename)

-        # We cannot check for empty DataFrame directly with Dask without computation
-        # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
-
         # Ensure directory exists and clear if necessary
         self._ensure_directory_exists(full_path, clear_existing=clear_existing)

@@ -31,19 +42,19 @@ class ParquetSaver:
     def _define_schema(self) -> pa.Schema:
         """Define a PyArrow schema dynamically based on df_result column types."""
         pandas_dtype_to_pa = {
-            'object': pa.string(),
-            'string': pa.string(),
-            'Int64': pa.int64(),
-            'int64': pa.int64(),
-            'float64': pa.float64(),
-            'float32': pa.float32(),
-            'bool': pa.bool_(),
-            'boolean': pa.bool_(),  # pandas nullable boolean
-            'datetime64[ns]': pa.timestamp('ns'),
-            'timedelta[ns]': pa.duration('ns')
+            "object": pa.string(),
+            "string": pa.string(),
+            "Int64": pa.int64(),
+            "int64": pa.int64(),
+            "float64": pa.float64(),
+            "float32": pa.float32(),
+            "bool": pa.bool_(),
+            "boolean": pa.bool_(),  # pandas nullable boolean
+            "datetime64[ns]": pa.timestamp("ns"),
+            "timedelta[ns]": pa.duration("ns"),
         }

-        dtypes = self.df_result.dtypes  # No need to call .compute()
+        dtypes = self.df_result.dtypes

         fields = [
             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
@@ -58,47 +69,160 @@ class ParquetSaver:
             col_name = field.name
             if col_name in self.df_result.columns:
                 if pa.types.is_string(field.type):
-                    dtype_mapping[col_name] = 'string'
+                    dtype_mapping[col_name] = "string"
                 elif pa.types.is_int64(field.type):
-                    dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
+                    dtype_mapping[col_name] = "Int64"
                 elif pa.types.is_float64(field.type):
-                    dtype_mapping[col_name] = 'float64'
+                    dtype_mapping[col_name] = "float64"
                 elif pa.types.is_float32(field.type):
-                    dtype_mapping[col_name] = 'float32'
+                    dtype_mapping[col_name] = "float32"
                 elif pa.types.is_boolean(field.type):
-                    dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
+                    dtype_mapping[col_name] = "boolean"
                 elif pa.types.is_timestamp(field.type):
-                    dtype_mapping[col_name] = 'datetime64[ns]'
+                    dtype_mapping[col_name] = "datetime64[ns]"
                 else:
-                    dtype_mapping[col_name] = 'object'  # Fallback to object
-        # Convert dtypes
+                    dtype_mapping[col_name] = "object"
         self.df_result = self.df_result.astype(dtype_mapping)

-    def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
+    def _construct_full_path(self, parquet_filename: Optional[str]) -> str:
         """Construct and return the full path for the Parquet file."""
-        _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
         parquet_filename = parquet_filename or "default.parquet"
-        return Path(base_path) / parquet_filename
+        return f"{self.parquet_storage_path}/{parquet_filename}"

-    @staticmethod
-    def _ensure_directory_exists(full_path: Path, clear_existing=False):
-        """Ensure that the directory for the path exists, clearing it if specified."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        directory = str(full_path.parent)
+    def _ensure_directory_exists(self, full_path: str, clear_existing=False):
+        """
+        Ensure that the directory for the path exists, clearing it if specified.
+        :param full_path: Full path for the target file.
+        :param clear_existing: Whether to clear existing files/directories.
+        """
+        directory = "/".join(full_path.split("/")[:-1])

-        if fs.exists(directory):
+        if self.fs.exists(directory):
             if clear_existing:
-                fs.rm(directory, recursive=True)
+                self.logger.info(f"Clearing existing directory: {directory}")
+                self.fs.rm(directory, recursive=True)
         else:
-            fs.mkdirs(directory, exist_ok=True)
+            self.logger.info(f"Creating directory: {directory}")
+            self.fs.mkdirs(directory, exist_ok=True)

-    def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
+    def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
         """Save the DataFrame to Parquet using the specified schema."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        if fs.exists(str(full_path)):
-            fs.rm(str(full_path), recursive=True)
+        if self.fs.exists(full_path):
+            self.logger.info(f"Overwriting existing file: {full_path}")
+            self.fs.rm(full_path, recursive=True)

-        # Save the Dask DataFrame to Parquet
+        self.logger.info(f"Saving Parquet file to: {full_path}")
         self.df_result.to_parquet(
-            str(full_path), engine="pyarrow", schema=schema, write_index=False
+            full_path,
+            engine="pyarrow",
+            schema=schema,
+            storage_options=self.fs.storage_options if hasattr(self.fs, "storage_options") else None,
+            write_index=False,
         )
+
+# from pathlib import Path
+# from typing import Optional
+#
+# import fsspec
+# import pyarrow as pa
+#
+# from sibi_dst.utils import Logger
+#
+#
+# class ParquetSaver:
+#     def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+#         # Ensure df_result is a Dask DataFrame
+#         self.fs = fs or fsspec.filesystem("file")
+#         self.df_result = df_result
+#         self.parquet_storage_path = parquet_storage_path
+#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+#
+#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+#         full_path = self._construct_full_path(parquet_filename)
+#
+#         # We cannot check for empty DataFrame directly with Dask without computation
+#         # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
+#
+#         # Ensure directory exists and clear if necessary
+#         self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+#
+#         # Define schema and save DataFrame to Parquet
+#         schema = self._define_schema()
+#         self._convert_dtypes(schema)
+#         self._save_dataframe_to_parquet(full_path, schema)
+#
+#     def _define_schema(self) -> pa.Schema:
+#         """Define a PyArrow schema dynamically based on df_result column types."""
+#         pandas_dtype_to_pa = {
+#             'object': pa.string(),
+#             'string': pa.string(),
+#             'Int64': pa.int64(),
+#             'int64': pa.int64(),
+#             'float64': pa.float64(),
+#             'float32': pa.float32(),
+#             'bool': pa.bool_(),
+#             'boolean': pa.bool_(),  # pandas nullable boolean
+#             'datetime64[ns]': pa.timestamp('ns'),
+#             'timedelta[ns]': pa.duration('ns')
+#         }
+#
+#         dtypes = self.df_result.dtypes  # No need to call .compute()
+#
+#         fields = [
+#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
+#             for col, dtype in dtypes.items()
+#         ]
+#         return pa.schema(fields)
+#
+#     def _convert_dtypes(self, schema: pa.Schema):
+#         """Convert DataFrame columns to match the specified schema."""
+#         dtype_mapping = {}
+#         for field in schema:
+#             col_name = field.name
+#             if col_name in self.df_result.columns:
+#                 if pa.types.is_string(field.type):
+#                     dtype_mapping[col_name] = 'string'
+#                 elif pa.types.is_int64(field.type):
+#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
+#                 elif pa.types.is_float64(field.type):
+#                     dtype_mapping[col_name] = 'float64'
+#                 elif pa.types.is_float32(field.type):
+#                     dtype_mapping[col_name] = 'float32'
+#                 elif pa.types.is_boolean(field.type):
+#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
+#                 elif pa.types.is_timestamp(field.type):
+#                     dtype_mapping[col_name] = 'datetime64[ns]'
+#                 else:
+#                     dtype_mapping[col_name] = 'object'  # Fallback to object
+#         # Convert dtypes
+#         self.df_result = self.df_result.astype(dtype_mapping)
+#
+#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
+#         """Construct and return the full path for the Parquet file."""
+#         _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
+#         parquet_filename = parquet_filename or "default.parquet"
+#         return Path(base_path) / parquet_filename
+#
+#     @staticmethod
+#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
+#         """Ensure that the directory for the path exists, clearing it if specified."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         directory = str(full_path.parent)
+#
+#         if fs.exists(directory):
+#             if clear_existing:
+#                 fs.rm(directory, recursive=True)
+#         else:
+#             fs.mkdirs(directory, exist_ok=True)
+#
+#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
+#         """Save the DataFrame to Parquet using the specified schema."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         print(f"Saving to {str(full_path)}")
+#         if fs.exists(str(full_path)):
+#             fs.rm(str(full_path), recursive=True)
+#
+#         # Save the Dask DataFrame to Parquet
+#         self.df_result.to_parquet(
+#             str(full_path), engine="pyarrow", schema=schema, write_index=False
+#         )
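In summary, ParquetSaver now receives a ready-made fsspec filesystem, builds paths as plain strings (so s3:// URLs survive untouched), logs each step, and forwards the filesystem's storage_options to Dask's to_parquet. A minimal local sketch, assuming ParquetSaver is exported from sibi_dst.utils (adjust the import if it is not):

    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    from sibi_dst.utils import ParquetSaver  # assumed export

    ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1)
    fs = fsspec.filesystem("file")

    saver = ParquetSaver(ddf, parquet_storage_path="/tmp/sibi_dst_demo/exports", fs=fs)
    saver.save_to_parquet("example.parquet", clear_existing=True)
    print(fs.ls("/tmp/sibi_dst_demo/exports"))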
sibi_dst/utils/storage_manager.py

@@ -7,11 +7,12 @@ class StorageManager:
     def __init__(self, storage_path, fs_type="file", fs_options=None):
         """
         Initializes the StorageManager with the base storage path and file system settings.
-        :param storage_path: Base path for the storage.
+        :param storage_path: Base path for the storage (e.g., "s3://my-bucket").
         :param fs_type: File system type (e.g., "file", "s3").
         :param fs_options: Dictionary of options for fsspec file system (e.g., credentials).
         """
-        self.storage_path = storage_path
+        # Ensure the storage_path ends with a slash for consistency
+        self.storage_path = storage_path.rstrip("/")
         self.fs_type = fs_type
         self.fs_options = fs_options or {}
         self.fs = fsspec.filesystem(fs_type, **self.fs_options)
@@ -33,6 +34,7 @@ class StorageManager:
         :param dirs_to_create: List of subdirectories to create.
         :param clear_existing: Whether to clear existing directories.
         """
+        print(f"Setting up directories under: {base_path}")
         if clear_existing:
             print(f"Warning: All existing contents in {base_path} will be removed.")
             if self.fs.exists(base_path):
@@ -44,6 +46,7 @@ class StorageManager:
         # Create subdirectories
         for sub_directory in dirs_to_create:
             sub_path = self.join_paths(base_path, sub_directory)
+            print(f"Creating directory: {sub_path}")
             if clear_existing and self.fs.exists(sub_path):
                 self.fs.rm(sub_path, recursive=True)
             self.fs.mkdirs(sub_path, exist_ok=True)
@@ -59,6 +62,7 @@ class StorageManager:
         # Ensure directories exist (optionally clear existing ones)
         for depot, sub_directories in depots.items():
             depot_path = self.join_paths(self.storage_path, depot)
+            print(f"Rebuilding depot at: {depot_path}")
             self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)

         # Generate depot_paths dictionary
@@ -86,4 +90,4 @@ class StorageManager:
         """
         print("Rebuilding depot structure...")
         self.rebuild_depot_paths(depots, clear_existing=clear_existing)
-        print("Rebuild complete.")
+        print("Rebuild complete.")
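The StorageManager changes are cosmetic by comparison: trailing slashes are stripped from the base path and each directory operation is echoed with print. A short usage sketch, assuming StorageManager is exported from sibi_dst.utils; the depot layout is hypothetical:

    from sibi_dst.utils import StorageManager  # assumed export

    manager = StorageManager("/tmp/sibi_dst_demo/storage/")  # trailing slash is stripped
    depots = {"sales_depot": ["bronze", "silver", "gold"]}    # hypothetical layout
    manager.rebuild_depot_paths(depots, clear_existing=False)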
sibi_dst-0.3.31.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.30
+Version: 0.3.31
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
sibi_dst-0.3.31.dist-info/RECORD

@@ -1,7 +1,7 @@
 sibi_dst/__init__.py,sha256=CLHfzrFNqklNx5uMKAPtbZfkbBbVYR5qsiMro0RTfmA,252
 sibi_dst/df_helper/__init__.py,sha256=5yzslP6zYYOHsTtAzHnNDXHYjf_T6yW7baxwgtduWqQ,292
-sibi_dst/df_helper/_df_helper.py,sha256=MttqHot8dlHzo4G522JL-z6LOFWYVXqqz06k-4YcvRM,23447
-sibi_dst/df_helper/_parquet_artifact.py,sha256=nx1wTEyrjARpCCPNwBxYiBROee3CSb6c-u7Cpme_tdk,4978
+sibi_dst/df_helper/_df_helper.py,sha256=sZaI998N9yd7FuUgZ8Esrz-K0eh2kXky53h9K8-l4cw,23650
+sibi_dst/df_helper/_parquet_artifact.py,sha256=CtM0u6Y9I1ZjAOYJouAYv23VO88UMzJxJtd_Ooh7bNg,5144
 sibi_dst/df_helper/_parquet_reader.py,sha256=sbe8DsScNT2h6huNsz8mUxVnUGpJeRzbaONZ3u2sQeQ,1685
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
@@ -13,7 +13,7 @@ sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJS
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=l6GdzTsknfzyf8LAo_TuIWeiswLRRrLcmqAmirxpH8Q,2132
 sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1uqJzvdaPNTYRb5qXTlQ,182
 sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=Q8Ic9PLDGT4L97yqr20mr_NsdEeMMOlFkT7Z12yYCxI,3663
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=5fAv7KzSRvCpW-6ZiXcvrWAyf1KThs1qCgtrzGo3x8A,4503
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=BJQC2ZPnMMeN8iVq2scmhYtoZzkhdkZIPV1KamCPasc,4689
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
 sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=Og8dDFZX0FnS_ClLAik5O36mNgHSixUdg0_FNo-w-t4,1641
 sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
@@ -42,14 +42,14 @@ sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWd
 sibi_dst/utils/clickhouse_writer.py,sha256=xUhFDOuZt0eZDpVJNuLb7pfTHUV06NCYrNUx_a7qrSM,8580
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_utils.py,sha256=Kv87Br78EXlH_MSVzRspqLwrf6sqHIRQc0t3LDI0dSM,7045
-sibi_dst/utils/data_wrapper.py,sha256=DTK4hd_GUUi5lxcbEbMraKwzpmPh2IwX6WNWA4t-vx0,10693
+sibi_dst/utils/data_wrapper.py,sha256=Ope_G2Eq9FWg-phdTyU_7nsGnu4evsvofUVedd_SGas,11941
 sibi_dst/utils/date_utils.py,sha256=CMAZBNwVj7cvERcNiTA8Pf7_5EjV9By9yxkYJpkqz1g,10656
 sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10989
 sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
-sibi_dst/utils/filepath_generator.py,sha256=hjI7gQwfwRToPeuzoUQDayHKQrr4Ivhi4Chl1J4Phlk,6689
+sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
 sibi_dst/utils/log_utils.py,sha256=4eLmoV8VC7wDwPr1mRfDKP24_-laGO6ogE4U0u3DUuA,2315
-sibi_dst/utils/parquet_saver.py,sha256=hLrWr1G132y94eLopDPPGQGDsAiR1lQ8id4QQtGYPE4,4349
-sibi_dst/utils/storage_manager.py,sha256=7nkfeBW_2xlF59pGj7V2aY5TLwpJnPQuPVclqjavJOA,3856
-sibi_dst-0.3.30.dist-info/METADATA,sha256=nFzF7QqK-Rbyve6Ss2kkezC1LNEAUbktzZrgT6cUBpg,2474
-sibi_dst-0.3.30.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-0.3.30.dist-info/RECORD,,
+sibi_dst/utils/parquet_saver.py,sha256=BOXYLLRB2f3YSEwrKzX71Cs7d1BmOBSlcCfUzW2ON68,9716
+sibi_dst/utils/storage_manager.py,sha256=3KwPB0q-eGL2TTC7egox5ipvzTsAg1xVsHU8gK6vU3g,4129
+sibi_dst-0.3.31.dist-info/METADATA,sha256=QLKalk5RAXPb13dfDtncycmDPvlhr8vjII1cbvnNqjk,2474
+sibi_dst-0.3.31.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.31.dist-info/RECORD,,