sibi-dst 0.3.33__py3-none-any.whl → 0.3.34__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry; it is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -91,7 +91,7 @@ class DfHelper:
  self.filesystem_options = kwargs.pop('filesystem_options', {})
  kwargs.setdefault("live", True)
  kwargs.setdefault("logger", self.logger)
- kwargs.setdefault("fs", fsspec.filesystem('file'))
+ self.fs =kwargs.setdefault("fs", fsspec.filesystem('file'))
  self.__post_init(**kwargs)

  def __str__(self):
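With this change the filesystem injected through the `fs` keyword is kept on `self.fs`, so later calls such as `save_to_parquet()` can reuse it. A minimal sketch of passing a custom fsspec filesystem; the import path is assumed from the wheel's RECORD, and the constructor arguments other than `fs` are omitted because this diff does not show them:

    import fsspec

    from sibi_dst.df_helper import DfHelper  # export path assumed from the RECORD listing

    # Any fsspec filesystem works; "memory" keeps the sketch self-contained.
    # A real call will also need whatever backend configuration DfHelper expects.
    mem_fs = fsspec.filesystem("memory")
    helper = DfHelper(fs=mem_fs)  # helper.fs now points at mem_fs instead of the local-file default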
@@ -208,6 +208,18 @@ class DfHelper:
  return asyncio.run(self.__load_from_http(**options))

  def __load_from_sqlalchemy(self, **options):
+ """
+ Loads data from an SQLAlchemy database source into a dataframe. The method processes
+ the loaded data and applies post-processing to transform it into the desired structure.
+ If the operation fails, an empty pandas DataFrame is created as a fallback.
+
+ :param options: Additional keyword arguments to configure the data loading process.
+ These options can include configurations such as 'debug' and other parameters
+ required by the `SqlAlchemyLoadFromDb` class.
+ :type options: dict
+ :return: A dataframe containing the data loaded from the SQLAlchemy database.
+ :rtype: dask.dataframe.DataFrame
+ """
  try:
  options.setdefault("debug", self.debug)
  db_loader = SqlAlchemyLoadFromDb(
@@ -228,6 +240,17 @@ class DfHelper:
  return self.df

  def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ """
+ Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
+ and applies further post-processing before returning the dataframe. If the operation fails, an
+ empty dataframe with a single partition is returned instead.
+
+ :param options: Additional settings for the database loading process, which include optional configurations
+ like debug mode, among others.
+ :type options: dict
+ :return: A dataframe containing the loaded data either as a Pandas or Dask dataframe.
+ :rtype: Union[pd.DataFrame, dd.DataFrame]
+ """
  try:
  options.setdefault("debug", self.debug)
  db_loader = DjangoLoadFromDb(
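Both database-backed loaders share the fallback these new docstrings describe: when loading or post-processing fails, the helper hands back an empty single-partition Dask DataFrame instead of raising. A minimal sketch of that pattern, with `loader_fn` standing in for whichever backend loader is in play (a placeholder, not part of sibi-dst):

    import dask.dataframe as dd
    import pandas as pd

    def load_or_empty(loader_fn, **options):
        """Run a loader; on any failure return an empty one-partition Dask DataFrame."""
        try:
            return loader_fn(**options)
        except Exception:
            return dd.from_pandas(pd.DataFrame(), npartitions=1)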
@@ -248,7 +271,18 @@ class DfHelper:
  return self.df

  async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
- """Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
+ """
+ Loads data asynchronously from an HTTP source using the configured HTTP plugin.
+ If the HTTP plugin is not properly configured, this method logs a debug message and
+ returns an empty Dask DataFrame. If an exception occurs during data fetching, the error
+ is logged and an empty Dask DataFrame with one partition is returned.
+
+ :param options: Additional keyword arguments that are passed to the HTTP plugin for
+ fetching the data.
+ :returns: A DataFrame object that can either be a pandas or a Dask DataFrame. When the
+ fetching operation fails, it defaults to returning an empty Dask DataFrame
+ with a single partition.
+ """
  if not self.backend_http:
  self.logger.debug("HTTP plugin not configured properly.")
  return dd.from_pandas(pd.DataFrame(), npartitions=1)
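The same empty single-partition frame is what a caller receives when the HTTP plugin is missing or a fetch fails. It can be detected without materialising the whole DataFrame, using the per-partition length check that save_to_clickhouse applies further down; a short illustrative snippet:

    import dask.dataframe as dd
    import pandas as pd

    df = dd.from_pandas(pd.DataFrame(), npartitions=1)  # the helper's fallback value
    is_empty = df.map_partitions(len).compute().sum() == 0
    print(is_empty)  # True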
@@ -339,12 +373,45 @@ class DfHelper:

  self.logger.debug("Processing of loaded data completed.")

- def save_to_parquet(self, parquet_filename: Optional[str] = None):
- ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
+ def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
+ """
+ Save the dataframe result to a Parquet file using specified configurations.
+
+ This method leverages the ParquetSaver class to store the dataframe result
+ into a Parquet file. It also provides functionality for overriding the default
+ filesystem (`fs`) and storage path (`parquet_storage_path`). The method logs
+ details about the saving operation for debugging purposes.
+
+ :param parquet_filename: The name of the Parquet file to save the dataframe to.
+ If not provided, a default name will be used.
+ :param kwargs: Additional arguments to customize the saving process. These may
+ include:
+ - `fs`: Filesystem to be used for saving Parquet files. If not
+ provided, defaults to the instance's filesystem attribute.
+ - `parquet_storage_path`: The root path in the filesystem where
+ Parquet files should be saved. If not provided, defaults to
+ the instance's attribute for storage path.
+ :return: None
+ """
+ fs = kwargs.pop('fs', self.fs)
+ parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
+ ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
  ps.save_to_parquet(parquet_filename)
- self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
+ self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {parquet_storage_path}.")

  def save_to_clickhouse(self, **credentials):
+ """
+ Saves the current DataFrame to ClickHouse using the provided credentials. This
+ method first checks if the DataFrame is empty. If it is empty, the method logs
+ a debug message and does not proceed with saving. Otherwise, it initializes
+ a ClickHouseWriter instance and uses it to save the DataFrame to ClickHouse,
+ logging a debug message upon successful completion.
+
+ :param credentials: Credentials required to connect to ClickHouse as keyword
+ arguments.
+ :type credentials: dict
+ :return: None
+ """
  if self.df.map_partitions(len).compute().sum() == 0:
  self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
  return
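With the new keyword handling, a single save can target a different filesystem or storage root without reconfiguring the helper, and save_to_clickhouse takes its connection settings as keyword credentials. A hedged sketch, assuming `helper` is an already-configured DfHelper instance with data loaded; the s3 bucket and the ClickHouse credential names are placeholders, not documented parameters:

    import fsspec

    # Default save: reuses helper.fs and helper.parquet_storage_path.
    helper.save_to_parquet("daily_extract.parquet")

    # Override both the filesystem and the storage root for this one save.
    s3 = fsspec.filesystem("s3")  # requires s3fs; illustrative
    helper.save_to_parquet(
        "daily_extract.parquet",
        fs=s3,
        parquet_storage_path="s3://my-bucket/parquet",  # hypothetical bucket
    )

    # Credential keyword names are hypothetical, shown only to illustrate
    # that they are forwarded as **credentials.
    helper.save_to_clickhouse(host="localhost", user="default", password="")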
@@ -353,6 +420,21 @@ class DfHelper:
  self.logger.debug("Save to ClickHouse completed.")

  def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ """
+ Loads data from parquet files into a DataFrame, applies provided filters, and handles exceptions.
+
+ This method leverages a backend-specific implementation to load data from parquet files into a
+ DataFrame. If additional options are provided and the data is successfully loaded, filters are
+ applied to the DataFrame using a filter handler. Errors during this process are handled gracefully
+ by logging the issue and returning an empty Dask DataFrame.
+
+ :param options: A dictionary of filter options to be applied to the DataFrame.
+ :type options: dict
+
+ :return: A DataFrame containing the loaded and filtered data. If the operation fails, an empty
+ Dask DataFrame is returned.
+ :rtype: Union[pd.DataFrame, dd.DataFrame]
+ """
  try:
  self.df = self.backend_parquet.load_files()
  if options and self.df is not None:
@@ -368,6 +450,27 @@ class DfHelper:
  return dd.from_pandas(pd.DataFrame(), npartitions=1)

  def load_period(self, **kwargs):
+ """
+ Loads a period with specified parameters.
+
+ This method acts as a wrapper around the private ``__load_period`` method. It
+ accepts arbitrary keyword arguments that are passed directly to the private
+ method for execution. The purpose of allowing keyword arguments is to permit
+ flexible configuration or parameterization for loading a specific period, based
+ on the internal implementation of the private ``__load_period`` method.
+
+ Note:
+ The arguments and return values are entirely determined by the private
+ method's behavior. This method is intentionally designed to mask details
+ of the internal logic behind the abstraction.
+
+ :param kwargs: Arbitrary keyword arguments to parameterize the internal logic
+ of loading a period. The specific keys and values expected by the
+ ``__load_period`` method depend on its own internal implementation.
+ :return: The result of calling the private ``__load_period`` method with the
+ provided keyword arguments. The return type is dependent on the internal
+ implementation of ``__load_period``.
+ """
  return self.__load_period(**kwargs)

  def __load_period(self, **kwargs):
@@ -121,109 +121,3 @@ class ParquetSaver:
  write_index=False,
  )

- # from pathlib import Path
- # from typing import Optional
- #
- # import fsspec
- # import pyarrow as pa
- #
- # from sibi_dst.utils import Logger
- #
- #
- # class ParquetSaver:
- # def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
- # # Ensure df_result is a Dask DataFrame
- # self.fs = fs or fsspec.filesystem("file")
- # self.df_result = df_result
- # self.parquet_storage_path = parquet_storage_path
- # self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
- #
- # def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
- # full_path = self._construct_full_path(parquet_filename)
- #
- # # We cannot check for empty DataFrame directly with Dask without computation
- # # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
- #
- # # Ensure directory exists and clear if necessary
- # self._ensure_directory_exists(full_path, clear_existing=clear_existing)
- #
- # # Define schema and save DataFrame to Parquet
- # schema = self._define_schema()
- # self._convert_dtypes(schema)
- # self._save_dataframe_to_parquet(full_path, schema)
- #
- # def _define_schema(self) -> pa.Schema:
- # """Define a PyArrow schema dynamically based on df_result column types."""
- # pandas_dtype_to_pa = {
- # 'object': pa.string(),
- # 'string': pa.string(),
- # 'Int64': pa.int64(),
- # 'int64': pa.int64(),
- # 'float64': pa.float64(),
- # 'float32': pa.float32(),
- # 'bool': pa.bool_(),
- # 'boolean': pa.bool_(), # pandas nullable boolean
- # 'datetime64[ns]': pa.timestamp('ns'),
- # 'timedelta[ns]': pa.duration('ns')
- # }
- #
- # dtypes = self.df_result.dtypes # No need to call .compute()
- #
- # fields = [
- # pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
- # for col, dtype in dtypes.items()
- # ]
- # return pa.schema(fields)
- #
- # def _convert_dtypes(self, schema: pa.Schema):
- # """Convert DataFrame columns to match the specified schema."""
- # dtype_mapping = {}
- # for field in schema:
- # col_name = field.name
- # if col_name in self.df_result.columns:
- # if pa.types.is_string(field.type):
- # dtype_mapping[col_name] = 'string'
- # elif pa.types.is_int64(field.type):
- # dtype_mapping[col_name] = 'Int64' # pandas nullable integer
- # elif pa.types.is_float64(field.type):
- # dtype_mapping[col_name] = 'float64'
- # elif pa.types.is_float32(field.type):
- # dtype_mapping[col_name] = 'float32'
- # elif pa.types.is_boolean(field.type):
- # dtype_mapping[col_name] = 'boolean' # pandas nullable boolean
- # elif pa.types.is_timestamp(field.type):
- # dtype_mapping[col_name] = 'datetime64[ns]'
- # else:
- # dtype_mapping[col_name] = 'object' # Fallback to object
- # # Convert dtypes
- # self.df_result = self.df_result.astype(dtype_mapping)
- #
- # def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
- # """Construct and return the full path for the Parquet file."""
- # _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
- # parquet_filename = parquet_filename or "default.parquet"
- # return Path(base_path) / parquet_filename
- #
- # @staticmethod
- # def _ensure_directory_exists(full_path: Path, clear_existing=False):
- # """Ensure that the directory for the path exists, clearing it if specified."""
- # fs, _ = fsspec.core.url_to_fs(str(full_path))
- # directory = str(full_path.parent)
- #
- # if fs.exists(directory):
- # if clear_existing:
- # fs.rm(directory, recursive=True)
- # else:
- # fs.mkdirs(directory, exist_ok=True)
- #
- # def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
- # """Save the DataFrame to Parquet using the specified schema."""
- # fs, _ = fsspec.core.url_to_fs(str(full_path))
- # print(f"Saving to {str(full_path)}")
- # if fs.exists(str(full_path)):
- # fs.rm(str(full_path), recursive=True)
- #
- # # Save the Dask DataFrame to Parquet
- # self.df_result.to_parquet(
- # str(full_path), engine="pyarrow", schema=schema, write_index=False
- # )
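The removed block was a commented-out copy of an earlier ParquetSaver implementation; the live class keeps the keyword-based constructor that DfHelper.save_to_parquet calls above. A small hedged sketch of direct use; the module path is taken from the RECORD entry for sibi_dst/utils/parquet_saver.py (direct import assumed), and the local storage path is a placeholder:

    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    from sibi_dst.utils.parquet_saver import ParquetSaver  # module path from the RECORD; direct import assumed

    ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1)
    saver = ParquetSaver(
        df_result=ddf,
        parquet_storage_path="/tmp/sibi_parquet",  # placeholder location
        fs=fsspec.filesystem("file"),
    )
    saver.save_to_parquet("example.parquet")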
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.33
+ Version: 0.3.34
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
  sibi_dst/__init__.py,sha256=CLHfzrFNqklNx5uMKAPtbZfkbBbVYR5qsiMro0RTfmA,252
  sibi_dst/df_helper/__init__.py,sha256=5yzslP6zYYOHsTtAzHnNDXHYjf_T6yW7baxwgtduWqQ,292
- sibi_dst/df_helper/_df_helper.py,sha256=sZaI998N9yd7FuUgZ8Esrz-K0eh2kXky53h9K8-l4cw,23650
+ sibi_dst/df_helper/_df_helper.py,sha256=NRiLdHHO45SPwhif5JIQpfj56iC8HcffaRAyT7-TC2w,29585
  sibi_dst/df_helper/_parquet_artifact.py,sha256=K9FnKjXDmkqCzYqv5weS9scLHsPGyj0UUUoVzOtWv30,8858
  sibi_dst/df_helper/_parquet_reader.py,sha256=HhzhKtV_7qABHJvmpU2CssjNLgQHUB07eF0CqqzmkOs,3654
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -48,8 +48,8 @@ sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10
  sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
  sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
  sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
- sibi_dst/utils/parquet_saver.py,sha256=kR4FsjdMurQF46M0jc2Kvze4Ue70lUxefEzS0iszln8,9740
+ sibi_dst/utils/parquet_saver.py,sha256=FmSTOVhKruGw6r5G1sH3kKqsP0tCuU32KTlyQBLpXos,5092
  sibi_dst/utils/storage_manager.py,sha256=qHo5vTv-dr1roRr_mOcprSTdlAfH4Q2Dy5tQUz06Pnk,4228
- sibi_dst-0.3.33.dist-info/METADATA,sha256=yghZuscDKJkUhggh-hpGnJm6of6pq-_BRVcKc4wt1_E,2564
- sibi_dst-0.3.33.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- sibi_dst-0.3.33.dist-info/RECORD,,
+ sibi_dst-0.3.34.dist-info/METADATA,sha256=ewd8lmlRjJg0lEeEI0ju5g20zGk7Lk1bdgBxunNpf3s,2564
+ sibi_dst-0.3.34.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ sibi_dst-0.3.34.dist-info/RECORD,,