sibi-dst 0.3.30__py3-none-any.whl → 0.3.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/sibi_dst/df_helper/_df_helper.py
+++ b/sibi_dst/df_helper/_df_helper.py
@@ -9,6 +9,7 @@ import dask.dataframe as dd
 from dask import delayed, compute
 import pandas as pd
 from pydantic import BaseModel
+import fsspec
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
@@ -86,8 +87,11 @@ class DfHelper:
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
+        self.filesystem = kwargs.pop('filesystem', 'file')
+        self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
+        kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)
 
     def __str__(self):
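
The net effect of this hunk: 'filesystem' and 'filesystem_options' are consumed before the remaining kwargs are forwarded, and an 'fs' kwarg now defaults to a local fsspec filesystem. A minimal sketch of how a caller might inject a remote filesystem instead, assuming DfHelper is exported from sibi_dst.df_helper and accepts these kwargs end to end (neither is shown in the diff):

    import fsspec
    from sibi_dst.df_helper import DfHelper  # assumed export path

    # A pre-built fsspec instance overrides the fsspec.filesystem('file') default.
    s3 = fsspec.filesystem("s3", key="KEY", secret="SECRET")  # placeholder credentials
    helper = DfHelper(
        backend="parquet",                        # assumed backend name
        parquet_storage_path="s3://bucket/data",
        fs=s3,
    )
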
--- a/sibi_dst/df_helper/_parquet_artifact.py
+++ b/sibi_dst/df_helper/_parquet_artifact.py
@@ -13,7 +13,7 @@ class ParquetArtifact(DfHelper):
         'backend': 'parquet'
     }
 
-    def __init__(self, data_wrapper_class, filesystem_type="file", filesystem_options=None, **kwargs):
+    def __init__(self, data_wrapper_class, **kwargs):
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,
@@ -39,13 +39,14 @@ class ParquetArtifact(DfHelper):
             raise ValueError('parquet_end_date must be set')
 
         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options or {}
-        self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-
+        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
+        self.filesystem_options = self.config.setdefault('filesystem_options', {})
+        self.fs = self.config.setdefault('fs', None)
+        if self.fs is None:
+            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        self.config.setdefault('fs', self.fs)
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-
         super().__init__(**self.config)
 
     def load(self, **kwargs):
@@ -60,6 +61,11 @@ class ParquetArtifact(DfHelper):
         dw = DataWrapper(self.data_wrapper_class, **params)
         dw.process()
 
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Ensure resources are cleaned up
+        if self.fs:
+            self.fs.close()
+
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
         """Update the Parquet file with data from a specific period."""
         kwargs.update(self.parse_parquet_period(period=period))
@@ -97,6 +103,7 @@ class ParquetArtifact(DfHelper):
             'history_days_threshold': kwargs.pop('history_days_threshold', 30),
             'max_age_minutes': kwargs.pop('max_age_minutes', 10),
             'show_progress': kwargs.pop('show_progress', False),
+            'fs': self.fs,
             'filesystem_type': self.filesystem_type,
             'filesystem_options': self.filesystem_options,
         }
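
Filesystem settings now travel inside the config kwargs rather than as dedicated constructor parameters, and the resolved self.fs is handed straight to DataWrapper. A hedged construction sketch; MyWrapper is hypothetical and the date keys are assumptions inferred from the must-be-set checks above:

    import fsspec
    from sibi_dst.df_helper import ParquetArtifact  # assumed export path

    artifact = ParquetArtifact(
        MyWrapper,                                    # hypothetical data_wrapper_class
        parquet_storage_path="s3://bucket/artifacts",
        parquet_start_date="2024-01-01",              # assumed config keys
        parquet_end_date="2024-12-31",
        filesystem_type="s3",
        filesystem_options={"anon": False},
        # or pass a ready instance; setdefault means an explicit fs wins:
        # fs=fsspec.filesystem("s3", anon=False),
    )
    artifact.update_parquet(period="today")
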
--- a/sibi_dst/df_helper/_parquet_reader.py
+++ b/sibi_dst/df_helper/_parquet_reader.py
@@ -31,7 +31,10 @@ class ParquetReader(DfHelper):
         # Filesystem setup
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        self.fs = self.config.setdefault('fs', None)
+        if self.fs is None:
+            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        self.config.setdefault('fs', self.fs)
 
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
@@ -48,3 +51,8 @@ class ParquetReader(DfHelper):
             return info['type'] == 'directory'
         except FileNotFoundError:
             return False
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Ensure resources are cleaned up
+        if self.fs:
+            self.fs.close()
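
Both helper classes now release the shared filesystem in __exit__ rather than each owning a private instance. The same injection pattern applies here; constructor kwargs beyond those visible in the hunk are assumptions:

    import fsspec
    from sibi_dst.df_helper import ParquetReader  # assumed export path

    fs = fsspec.filesystem("s3", anon=True)  # placeholder: public bucket, s3fs installed
    reader = ParquetReader(
        parquet_storage_path="s3://bucket/data",  # assumed kwarg, per the existence check
        fs=fs,                                    # shared instance; __exit__ will close it
    )
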
--- a/sibi_dst/df_helper/backends/parquet/_parquet_options.py
+++ b/sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -13,8 +13,8 @@ from sibi_dst.utils import Logger
 class ParquetConfig(BaseModel):
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
-    parquet_storage_path: Optional[DirectoryPath] = None
-    parquet_full_path: Optional[FilePath] = None
+    parquet_storage_path: Optional[str] = None
+    parquet_full_path: Optional[str] = None
     parquet_folder_list: Optional[List[str]] = None
     parquet_size_bytes: int = 0
     parquet_max_age_minutes: int = 0
@@ -30,14 +30,17 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
-            str(self.parquet_storage_path).split("://")[0])
-
+        #self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+        #    str(self.parquet_storage_path).split("://")[0])
         # Validation for parquet path
+
+
         if self.parquet_storage_path is None:
             raise ValueError('Parquet storage path must be specified')
+        self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
         if not self.fs.exists(self.parquet_storage_path):
-            raise ValueError('Parquet storage path does not exist')
+            self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
+            #raise ValueError('Parquet storage path does not exist')
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(
@@ -57,8 +60,9 @@ class ParquetConfig(BaseModel):
             raise ValueError('Parquet end date must be greater than start date')
 
         # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-        self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+        self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
                                                      logger=self.logger).generate_file_paths(start_date, end_date)
+
         self.parquet_size_bytes = self.get_parquet_size_bytes()
         self.load_parquet = True
         # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
@@ -84,11 +88,12 @@ class ParquetConfig(BaseModel):
         return total_size
 
     def load_files(self):
+
         if self.load_parquet:
             if self.parquet_folder_list:
-                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow")
+                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
             else:
-                return dd.read_parquet(self.parquet_full_path, engine="pyarrow")
+                return dd.read_parquet(self.parquet_full_path, engine="pyarrow", filesystem=self.fs)
 
     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
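
Passing the already-configured instance through the 'filesystem' keyword means dask no longer has to re-derive the filesystem (and its credentials) from the path string. A standalone sketch of the same call pattern against a hypothetical bucket:

    import dask.dataframe as dd
    import fsspec

    fs = fsspec.filesystem("s3", anon=True)
    # dd.read_parquet accepts an fsspec filesystem via the 'filesystem' keyword,
    # so options set on 'fs' are reused instead of parsed from the URL.
    df = dd.read_parquet("s3://bucket/data/*.parquet", engine="pyarrow", filesystem=fs)
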
--- a/sibi_dst/utils/data_wrapper.py
+++ b/sibi_dst/utils/data_wrapper.py
@@ -22,6 +22,7 @@ class DataWrapper:
                  parquet_filename: str,
                  start_date: Any,
                  end_date: Any,
+                 fs: Optional[fsspec.AbstractFileSystem] = None,
                  filesystem_type: str = "file",
                  filesystem_options: Optional[Dict] = None,
                  verbose: bool = False,
@@ -41,7 +42,7 @@ class DataWrapper:
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fsspec.filesystem(filesystem_type, **self.filesystem_options)
+        self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}
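
The 'fs or fsspec.filesystem(...)' fallback keeps the old behaviour when no instance is supplied. A construction sketch with assumed required arguments (MyWrapper is hypothetical, and DataWrapper may require parameters not visible in this diff):

    import fsspec
    from sibi_dst.utils import DataWrapper  # assumed export path

    wrapper = DataWrapper(
        MyWrapper,
        parquet_filename="data.parquet",
        start_date="2024-01-01",
        end_date="2024-01-31",
        fs=fsspec.filesystem("file"),  # takes precedence over filesystem_type/options
    )
    wrapper.process()
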
@@ -129,23 +130,45 @@ class DataWrapper:
     def is_file_older_than(self, file_path: str) -> bool:
         """
         Check if a file is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file.
+        :return: True if the file is older than max_age_minutes, False otherwise.
         """
         try:
+            # Get file info
             info = self.fs.info(file_path)
-            file_modification_time = info['mtime']
-            file_modification_datetime = datetime.datetime.fromtimestamp(
-                file_modification_time, tz=datetime.timezone.utc
-            )
+
+            # Determine the modification time from available keys
+            file_modification_time = None
+            if "mtime" in info:  # Local filesystem
+                file_modification_time = info["mtime"]
+                file_modification_datetime = datetime.datetime.fromtimestamp(
+                    file_modification_time, tz=datetime.timezone.utc
+                )
+            elif "LastModified" in info:  # S3-compatible filesystem
+                file_modification_datetime = (
+                    info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
+                    else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
+                )
+            else:
+                self.logger.warning(f"Modification time not available for {file_path}.")
+                return True  # Assume file is too old if we cannot determine its age
+
+            # Compare file age
             current_time = datetime.datetime.now(datetime.timezone.utc)
             file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
             self.logger.info(
                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                 f"(threshold: {self.max_age_minutes} minutes)"
             )
-
             return file_age_minutes > self.max_age_minutes
+
         except FileNotFoundError:
-            return True
+            self.logger.warning(f"File {file_path} not found.")
+            return True  # File is considered old if it doesn't exist
+        except Exception as e:
+            self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
+            return True  #
 
     def process_date(self, date: datetime.date):
         """Process a specific date by regenerating data as necessary."""
@@ -162,7 +185,7 @@ class DataWrapper:
             self.logger.error("No data found for the specified date.")
             return
 
-        parquet_saver = ParquetSaver(df, folder, self.logger)
+        parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
         parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
 
         end_time = datetime.datetime.now()
--- a/sibi_dst/utils/filepath_generator.py
+++ b/sibi_dst/utils/filepath_generator.py
@@ -91,7 +91,7 @@ class FilePathGenerator:
         if engine == 'dask':
             # Collect individual file paths
             file_pattern = f"{base_dir}/**/*.{self.file_extension}"
-            all_paths = self.fs.glob(file_pattern, recursive=True)
+            all_paths = self.fs.glob(file_pattern)
 
             if not all_paths and self.debug:
                 self.logger.debug(f"No files found with pattern: {file_pattern}")
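
The dropped keyword is the actual fix here: fsspec's glob signature has no 'recursive' flag (that argument belongs to the stdlib glob module); recursion comes from the '**' in the pattern itself:

    import fsspec

    fs = fsspec.filesystem("file")
    # '**' already matches across directory levels in fsspec-style globs.
    paths = fs.glob("/tmp/data/**/*.parquet")
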
--- a/sibi_dst/utils/parquet_saver.py
+++ b/sibi_dst/utils/parquet_saver.py
@@ -1,25 +1,36 @@
 from pathlib import Path
 from typing import Optional
 
-import fsspec
 import pyarrow as pa
+import fsspec
 
 from sibi_dst.utils import Logger
 
 
 class ParquetSaver:
-    def __init__(self, df_result, parquet_storage_path, logger=None):
-        # Ensure df_result is a Dask DataFrame
+    def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+        """
+        Initialize ParquetSaver.
+        :param df_result: Dask DataFrame to save.
+        :param parquet_storage_path: Base storage path (e.g., "s3://bucket-name/path/").
+        :param logger: Logger instance for logging messages.
+        :param fs: Pre-initialized fsspec filesystem instance. Defaults to 'file' if None.
+        """
        self.df_result = df_result
-        self.parquet_storage_path = parquet_storage_path
+        self.parquet_storage_path = parquet_storage_path.rstrip("/")
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
+        # Default to the local filesystem if `fs` is not provided
+        self.fs = fs or fsspec.filesystem("file")
+
     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+        """
+        Save the DataFrame to Parquet format.
+        :param parquet_filename: Filename for the Parquet file.
+        :param clear_existing: Whether to clear existing files in the target directory.
+        """
         full_path = self._construct_full_path(parquet_filename)
 
-        # We cannot check for empty DataFrame directly with Dask without computation
-        # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
-
         # Ensure directory exists and clear if necessary
         self._ensure_directory_exists(full_path, clear_existing=clear_existing)
 
@@ -27,23 +38,24 @@ class ParquetSaver:
         schema = self._define_schema()
         self._convert_dtypes(schema)
         self._save_dataframe_to_parquet(full_path, schema)
+        self.fs.close()
 
     def _define_schema(self) -> pa.Schema:
         """Define a PyArrow schema dynamically based on df_result column types."""
         pandas_dtype_to_pa = {
-            'object': pa.string(),
-            'string': pa.string(),
-            'Int64': pa.int64(),
-            'int64': pa.int64(),
-            'float64': pa.float64(),
-            'float32': pa.float32(),
-            'bool': pa.bool_(),
-            'boolean': pa.bool_(),  # pandas nullable boolean
-            'datetime64[ns]': pa.timestamp('ns'),
-            'timedelta[ns]': pa.duration('ns')
+            "object": pa.string(),
+            "string": pa.string(),
+            "Int64": pa.int64(),
+            "int64": pa.int64(),
+            "float64": pa.float64(),
+            "float32": pa.float32(),
+            "bool": pa.bool_(),
+            "boolean": pa.bool_(),  # pandas nullable boolean
+            "datetime64[ns]": pa.timestamp("ns"),
+            "timedelta[ns]": pa.duration("ns"),
         }
 
-        dtypes = self.df_result.dtypes  # No need to call .compute()
+        dtypes = self.df_result.dtypes
 
         fields = [
             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
@@ -58,47 +70,160 @@ class ParquetSaver:
             col_name = field.name
             if col_name in self.df_result.columns:
                 if pa.types.is_string(field.type):
-                    dtype_mapping[col_name] = 'string'
+                    dtype_mapping[col_name] = "string"
                 elif pa.types.is_int64(field.type):
-                    dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
+                    dtype_mapping[col_name] = "Int64"
                 elif pa.types.is_float64(field.type):
-                    dtype_mapping[col_name] = 'float64'
+                    dtype_mapping[col_name] = "float64"
                 elif pa.types.is_float32(field.type):
-                    dtype_mapping[col_name] = 'float32'
+                    dtype_mapping[col_name] = "float32"
                 elif pa.types.is_boolean(field.type):
-                    dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
+                    dtype_mapping[col_name] = "boolean"
                 elif pa.types.is_timestamp(field.type):
-                    dtype_mapping[col_name] = 'datetime64[ns]'
+                    dtype_mapping[col_name] = "datetime64[ns]"
                 else:
-                    dtype_mapping[col_name] = 'object'  # Fallback to object
-        # Convert dtypes
+                    dtype_mapping[col_name] = "object"
         self.df_result = self.df_result.astype(dtype_mapping)
 
-    def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
+    def _construct_full_path(self, parquet_filename: Optional[str]) -> str:
         """Construct and return the full path for the Parquet file."""
-        _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
         parquet_filename = parquet_filename or "default.parquet"
-        return Path(base_path) / parquet_filename
+        return f"{self.parquet_storage_path}/{parquet_filename}"
 
-    @staticmethod
-    def _ensure_directory_exists(full_path: Path, clear_existing=False):
-        """Ensure that the directory for the path exists, clearing it if specified."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        directory = str(full_path.parent)
+    def _ensure_directory_exists(self, full_path: str, clear_existing=False):
+        """
+        Ensure that the directory for the path exists, clearing it if specified.
+        :param full_path: Full path for the target file.
+        :param clear_existing: Whether to clear existing files/directories.
+        """
+        directory = "/".join(full_path.split("/")[:-1])
 
-        if fs.exists(directory):
+        if self.fs.exists(directory):
             if clear_existing:
-                fs.rm(directory, recursive=True)
+                self.logger.info(f"Clearing existing directory: {directory}")
+                self.fs.rm(directory, recursive=True)
         else:
-            fs.mkdirs(directory, exist_ok=True)
+            self.logger.info(f"Creating directory: {directory}")
+            self.fs.mkdirs(directory, exist_ok=True)
 
-    def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
+    def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
         """Save the DataFrame to Parquet using the specified schema."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        if fs.exists(str(full_path)):
-            fs.rm(str(full_path), recursive=True)
+        if self.fs.exists(full_path):
+            self.logger.info(f"Overwriting existing file: {full_path}")
+            self.fs.rm(full_path, recursive=True)
 
-        # Save the Dask DataFrame to Parquet
+        self.logger.info(f"Saving Parquet file to: {full_path}")
         self.df_result.to_parquet(
-            str(full_path), engine="pyarrow", schema=schema, write_index=False
+            full_path,
+            engine="pyarrow",
+            schema=schema,
+            storage_options=self.fs.storage_options if hasattr(self.fs, "storage_options") else None,
+            write_index=False,
         )
+
+# from pathlib import Path
+# from typing import Optional
+#
+# import fsspec
+# import pyarrow as pa
+#
+# from sibi_dst.utils import Logger
+#
+#
+# class ParquetSaver:
+#     def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+#         # Ensure df_result is a Dask DataFrame
+#         self.fs = fs or fsspec.filesystem("file")
+#         self.df_result = df_result
+#         self.parquet_storage_path = parquet_storage_path
+#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+#
+#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+#         full_path = self._construct_full_path(parquet_filename)
+#
+#         # We cannot check for empty DataFrame directly with Dask without computation
+#         # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
+#
+#         # Ensure directory exists and clear if necessary
+#         self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+#
+#         # Define schema and save DataFrame to Parquet
+#         schema = self._define_schema()
+#         self._convert_dtypes(schema)
+#         self._save_dataframe_to_parquet(full_path, schema)
+#
+#     def _define_schema(self) -> pa.Schema:
+#         """Define a PyArrow schema dynamically based on df_result column types."""
+#         pandas_dtype_to_pa = {
+#             'object': pa.string(),
+#             'string': pa.string(),
+#             'Int64': pa.int64(),
+#             'int64': pa.int64(),
+#             'float64': pa.float64(),
+#             'float32': pa.float32(),
+#             'bool': pa.bool_(),
+#             'boolean': pa.bool_(),  # pandas nullable boolean
+#             'datetime64[ns]': pa.timestamp('ns'),
+#             'timedelta[ns]': pa.duration('ns')
+#         }
+#
+#         dtypes = self.df_result.dtypes  # No need to call .compute()
+#
+#         fields = [
+#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
+#             for col, dtype in dtypes.items()
+#         ]
+#         return pa.schema(fields)
+#
+#     def _convert_dtypes(self, schema: pa.Schema):
+#         """Convert DataFrame columns to match the specified schema."""
+#         dtype_mapping = {}
+#         for field in schema:
+#             col_name = field.name
+#             if col_name in self.df_result.columns:
+#                 if pa.types.is_string(field.type):
+#                     dtype_mapping[col_name] = 'string'
+#                 elif pa.types.is_int64(field.type):
+#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
+#                 elif pa.types.is_float64(field.type):
+#                     dtype_mapping[col_name] = 'float64'
+#                 elif pa.types.is_float32(field.type):
+#                     dtype_mapping[col_name] = 'float32'
+#                 elif pa.types.is_boolean(field.type):
+#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
+#                 elif pa.types.is_timestamp(field.type):
+#                     dtype_mapping[col_name] = 'datetime64[ns]'
+#                 else:
+#                     dtype_mapping[col_name] = 'object'  # Fallback to object
+#         # Convert dtypes
+#         self.df_result = self.df_result.astype(dtype_mapping)
+#
+#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
+#         """Construct and return the full path for the Parquet file."""
+#         _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
+#         parquet_filename = parquet_filename or "default.parquet"
+#         return Path(base_path) / parquet_filename
+#
+#     @staticmethod
+#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
+#         """Ensure that the directory for the path exists, clearing it if specified."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         directory = str(full_path.parent)
+#
+#         if fs.exists(directory):
+#             if clear_existing:
+#                 fs.rm(directory, recursive=True)
+#         else:
+#             fs.mkdirs(directory, exist_ok=True)
+#
+#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
+#         """Save the DataFrame to Parquet using the specified schema."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         print(f"Saving to {str(full_path)}")
+#         if fs.exists(str(full_path)):
+#             fs.rm(str(full_path), recursive=True)
+#
+#         # Save the Dask DataFrame to Parquet
+#         self.df_result.to_parquet(
+#             str(full_path), engine="pyarrow", schema=schema, write_index=False
+#         )
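
A usage sketch for the reworked saver with a trivial Dask frame and a placeholder path. One caveat worth noting: save_to_parquet now ends with self.fs.close(), and not every fsspec filesystem class defines close(), so the injected instance should be treated as consumed:

    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    from sibi_dst.utils import ParquetSaver  # assumed export path

    df = dd.from_pandas(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1)
    saver = ParquetSaver(
        df,
        parquet_storage_path="/tmp/example_out",   # hypothetical target
        fs=fsspec.filesystem("file"),
    )
    # Writes /tmp/example_out/example.parquet, then calls fs.close().
    saver.save_to_parquet("example.parquet")
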
--- a/sibi_dst/utils/storage_manager.py
+++ b/sibi_dst/utils/storage_manager.py
@@ -7,11 +7,12 @@ class StorageManager:
     def __init__(self, storage_path, fs_type="file", fs_options=None):
         """
         Initializes the StorageManager with the base storage path and file system settings.
-        :param storage_path: Base path for the storage.
+        :param storage_path: Base path for the storage (e.g., "s3://my-bucket").
         :param fs_type: File system type (e.g., "file", "s3").
         :param fs_options: Dictionary of options for fsspec file system (e.g., credentials).
         """
-        self.storage_path = storage_path
+        # Ensure the storage_path ends with a slash for consistency
+        self.storage_path = storage_path.rstrip("/")
         self.fs_type = fs_type
         self.fs_options = fs_options or {}
         self.fs = fsspec.filesystem(fs_type, **self.fs_options)
@@ -33,6 +34,7 @@ class StorageManager:
         :param dirs_to_create: List of subdirectories to create.
         :param clear_existing: Whether to clear existing directories.
         """
+        print(f"Setting up directories under: {base_path}")
         if clear_existing:
             print(f"Warning: All existing contents in {base_path} will be removed.")
             if self.fs.exists(base_path):
@@ -44,6 +46,7 @@ class StorageManager:
         # Create subdirectories
         for sub_directory in dirs_to_create:
             sub_path = self.join_paths(base_path, sub_directory)
+            print(f"Creating directory: {sub_path}")
             if clear_existing and self.fs.exists(sub_path):
                 self.fs.rm(sub_path, recursive=True)
             self.fs.mkdirs(sub_path, exist_ok=True)
@@ -59,6 +62,7 @@ class StorageManager:
         # Ensure directories exist (optionally clear existing ones)
         for depot, sub_directories in depots.items():
             depot_path = self.join_paths(self.storage_path, depot)
+            print(f"Rebuilding depot at: {depot_path}")
             self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)
 
         # Generate depot_paths dictionary
@@ -87,3 +91,6 @@ class StorageManager:
         print("Rebuilding depot structure...")
         self.rebuild_depot_paths(depots, clear_existing=clear_existing)
         print("Rebuild complete.")
+
+    def get_fs_instance(self):
+        return fsspec.filesystem(self.fs_type, **self.fs_options)
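
get_fs_instance rebuilds a filesystem from the stored settings. Since fsspec caches instances by constructor arguments, this typically returns the same object as manager.fs rather than a fresh connection; a sketch with a placeholder bucket:

    from sibi_dst.utils import StorageManager  # assumed export path

    manager = StorageManager("s3://my-bucket", fs_type="s3", fs_options={"anon": False})
    fs = manager.get_fs_instance()
    assert fs is manager.fs  # expected, given fsspec's instance cache
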
--- a/sibi_dst-0.3.30.dist-info/METADATA
+++ b/sibi_dst-0.3.32.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.30
+Version: 0.3.32
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -20,6 +20,7 @@ Requires-Dist: django (>=5.1.4,<6.0.0)
 Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
 Requires-Dist: folium (>=0.19.4,<0.20.0)
 Requires-Dist: geopandas (>=1.0.1,<2.0.0)
+Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
 Requires-Dist: httpx (>=0.27.2,<0.28.0)
 Requires-Dist: ipython (>=8.29.0,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -42,6 +43,7 @@ Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
 Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
+Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst
--- a/sibi_dst-0.3.30.dist-info/RECORD
+++ b/sibi_dst-0.3.32.dist-info/RECORD
@@ -1,8 +1,8 @@
 sibi_dst/__init__.py,sha256=CLHfzrFNqklNx5uMKAPtbZfkbBbVYR5qsiMro0RTfmA,252
 sibi_dst/df_helper/__init__.py,sha256=5yzslP6zYYOHsTtAzHnNDXHYjf_T6yW7baxwgtduWqQ,292
-sibi_dst/df_helper/_df_helper.py,sha256=MttqHot8dlHzo4G522JL-z6LOFWYVXqqz06k-4YcvRM,23447
-sibi_dst/df_helper/_parquet_artifact.py,sha256=nx1wTEyrjARpCCPNwBxYiBROee3CSb6c-u7Cpme_tdk,4978
-sibi_dst/df_helper/_parquet_reader.py,sha256=sbe8DsScNT2h6huNsz8mUxVnUGpJeRzbaONZ3u2sQeQ,1685
+sibi_dst/df_helper/_df_helper.py,sha256=sZaI998N9yd7FuUgZ8Esrz-K0eh2kXky53h9K8-l4cw,23650
+sibi_dst/df_helper/_parquet_artifact.py,sha256=HVChP3UBCsEMpY-yyFERLaB76mWaziQXkdu2Qtzm7_s,5291
+sibi_dst/df_helper/_parquet_reader.py,sha256=0qJHMS1PLcODTLMS13UW5iFQLK8b3qjgy7qDzcupgII,1963
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
 sibi_dst/df_helper/backends/django/_db_connection.py,sha256=kWITSPqn3286NzPvWSSE_PtJCm1tyfrv2RIuPSThXlQ,1634
@@ -13,7 +13,7 @@ sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJS
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=l6GdzTsknfzyf8LAo_TuIWeiswLRRrLcmqAmirxpH8Q,2132
 sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1uqJzvdaPNTYRb5qXTlQ,182
 sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=Q8Ic9PLDGT4L97yqr20mr_NsdEeMMOlFkT7Z12yYCxI,3663
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=5fAv7KzSRvCpW-6ZiXcvrWAyf1KThs1qCgtrzGo3x8A,4503
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=BJQC2ZPnMMeN8iVq2scmhYtoZzkhdkZIPV1KamCPasc,4689
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
 sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=Og8dDFZX0FnS_ClLAik5O36mNgHSixUdg0_FNo-w-t4,1641
 sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
@@ -42,14 +42,14 @@ sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWd
 sibi_dst/utils/clickhouse_writer.py,sha256=xUhFDOuZt0eZDpVJNuLb7pfTHUV06NCYrNUx_a7qrSM,8580
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_utils.py,sha256=Kv87Br78EXlH_MSVzRspqLwrf6sqHIRQc0t3LDI0dSM,7045
-sibi_dst/utils/data_wrapper.py,sha256=DTK4hd_GUUi5lxcbEbMraKwzpmPh2IwX6WNWA4t-vx0,10693
+sibi_dst/utils/data_wrapper.py,sha256=Ope_G2Eq9FWg-phdTyU_7nsGnu4evsvofUVedd_SGas,11941
 sibi_dst/utils/date_utils.py,sha256=CMAZBNwVj7cvERcNiTA8Pf7_5EjV9By9yxkYJpkqz1g,10656
 sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10989
 sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
-sibi_dst/utils/filepath_generator.py,sha256=hjI7gQwfwRToPeuzoUQDayHKQrr4Ivhi4Chl1J4Phlk,6689
+sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
 sibi_dst/utils/log_utils.py,sha256=4eLmoV8VC7wDwPr1mRfDKP24_-laGO6ogE4U0u3DUuA,2315
-sibi_dst/utils/parquet_saver.py,sha256=hLrWr1G132y94eLopDPPGQGDsAiR1lQ8id4QQtGYPE4,4349
-sibi_dst/utils/storage_manager.py,sha256=7nkfeBW_2xlF59pGj7V2aY5TLwpJnPQuPVclqjavJOA,3856
-sibi_dst-0.3.30.dist-info/METADATA,sha256=nFzF7QqK-Rbyve6Ss2kkezC1LNEAUbktzZrgT6cUBpg,2474
-sibi_dst-0.3.30.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-0.3.30.dist-info/RECORD,,
+sibi_dst/utils/parquet_saver.py,sha256=kR4FsjdMurQF46M0jc2Kvze4Ue70lUxefEzS0iszln8,9740
+sibi_dst/utils/storage_manager.py,sha256=qHo5vTv-dr1roRr_mOcprSTdlAfH4Q2Dy5tQUz06Pnk,4228
+sibi_dst-0.3.32.dist-info/METADATA,sha256=8CNqCjmW44vqkrhy-hvVlSmHS3s5jiPr2VDZV5V1Nl0,2564
+sibi_dst-0.3.32.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.32.dist-info/RECORD,,