sibi-dst 0.3.33__py3-none-any.whl → 0.3.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -91,7 +91,7 @@ class DfHelper:
         self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
-        kwargs.setdefault("fs", fsspec.filesystem('file'))
+        self.fs =kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)

     def __str__(self):
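The new assignment works because dict.setdefault returns whichever value ends up stored under the key, so self.fs captures either the caller-supplied filesystem or the local default. A minimal sketch of that pattern outside the class (the resolve_fs helper and the "memory" override are illustrative, not part of sibi_dst):

import fsspec

def resolve_fs(**kwargs):
    # setdefault inserts the default only when "fs" is absent and always
    # returns the value now stored in the dict, so a caller-supplied
    # filesystem wins when one is provided.
    return kwargs.setdefault("fs", fsspec.filesystem("file"))

print(type(resolve_fs()).__name__)                                # LocalFileSystem
print(type(resolve_fs(fs=fsspec.filesystem("memory"))).__name__)  # MemoryFileSystem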
@@ -208,6 +208,18 @@ class DfHelper:
         return asyncio.run(self.__load_from_http(**options))

     def __load_from_sqlalchemy(self, **options):
+        """
+        Loads data from an SQLAlchemy database source into a dataframe. The method processes
+        the loaded data and applies post-processing to transform it into the desired structure.
+        If the operation fails, an empty pandas DataFrame is created as a fallback.
+
+        :param options: Additional keyword arguments to configure the data loading process.
+            These options can include configurations such as 'debug' and other parameters
+            required by the `SqlAlchemyLoadFromDb` class.
+        :type options: dict
+        :return: A dataframe containing the data loaded from the SQLAlchemy database.
+        :rtype: dask.dataframe.DataFrame
+        """
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
@@ -228,6 +240,17 @@ class DfHelper:
         return self.df

     def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        """
+        Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
+        and applies further post-processing before returning the dataframe. If the operation fails, an
+        empty dataframe with a single partition is returned instead.
+
+        :param options: Additional settings for the database loading process, which include optional configurations
+            like debug mode, among others.
+        :type options: dict
+        :return: A dataframe containing the loaded data either as a Pandas or Dask dataframe.
+        :rtype: Union[pd.DataFrame, dd.DataFrame]
+        """
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
@@ -248,7 +271,18 @@ class DfHelper:
         return self.df

     async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """
+        """
+        Loads data asynchronously from an HTTP source using the configured HTTP plugin.
+        If the HTTP plugin is not properly configured, this method logs a debug message and
+        returns an empty Dask DataFrame. If an exception occurs during data fetching, the error
+        is logged and an empty Dask DataFrame with one partition is returned.
+
+        :param options: Additional keyword arguments that are passed to the HTTP plugin for
+            fetching the data.
+        :returns: A DataFrame object that can either be a pandas or a Dask DataFrame. When the
+            fetching operation fails, it defaults to returning an empty Dask DataFrame
+            with a single partition.
+        """
         if not self.backend_http:
             self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -339,12 +373,45 @@ class DfHelper:

         self.logger.debug("Processing of loaded data completed.")

-    def save_to_parquet(self, parquet_filename: Optional[str] = None):
-
+    def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
+        """
+        Save the dataframe result to a Parquet file using specified configurations.
+
+        This method leverages the ParquetSaver class to store the dataframe result
+        into a Parquet file. It also provides functionality for overriding the default
+        filesystem (`fs`) and storage path (`parquet_storage_path`). The method logs
+        details about the saving operation for debugging purposes.
+
+        :param parquet_filename: The name of the Parquet file to save the dataframe to.
+            If not provided, a default name will be used.
+        :param kwargs: Additional arguments to customize the saving process. These may
+            include:
+            - `fs`: Filesystem to be used for saving Parquet files. If not
+              provided, defaults to the instance's filesystem attribute.
+            - `parquet_storage_path`: The root path in the filesystem where
+              Parquet files should be saved. If not provided, defaults to
+              the instance's attribute for storage path.
+        :return: None
+        """
+        fs = kwargs.pop('fs', self.fs)
+        parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
+        ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
         ps.save_to_parquet(parquet_filename)
-        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {
+        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {parquet_storage_path}.")

     def save_to_clickhouse(self, **credentials):
+        """
+        Saves the current DataFrame to ClickHouse using the provided credentials. This
+        method first checks if the DataFrame is empty. If it is empty, the method logs
+        a debug message and does not proceed with saving. Otherwise, it initializes
+        a ClickHouseWriter instance and uses it to save the DataFrame to ClickHouse,
+        logging a debug message upon successful completion.
+
+        :param credentials: Credentials required to connect to ClickHouse as keyword
+            arguments.
+        :type credentials: dict
+        :return: None
+        """
         if self.df.map_partitions(len).compute().sum() == 0:
             self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
             return
@@ -353,6 +420,21 @@ class DfHelper:
         self.logger.debug("Save to ClickHouse completed.")

     def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        """
+        Loads data from parquet files into a DataFrame, applies provided filters, and handles exceptions.
+
+        This method leverages a backend-specific implementation to load data from parquet files into a
+        DataFrame. If additional options are provided and the data is successfully loaded, filters are
+        applied to the DataFrame using a filter handler. Errors during this process are handled gracefully
+        by logging the issue and returning an empty Dask DataFrame.
+
+        :param options: A dictionary of filter options to be applied to the DataFrame.
+        :type options: dict
+
+        :return: A DataFrame containing the loaded and filtered data. If the operation fails, an empty
+            Dask DataFrame is returned.
+        :rtype: Union[pd.DataFrame, dd.DataFrame]
+        """
         try:
             self.df = self.backend_parquet.load_files()
             if options and self.df is not None:
@@ -368,6 +450,27 @@ class DfHelper:
         return dd.from_pandas(pd.DataFrame(), npartitions=1)

     def load_period(self, **kwargs):
+        """
+        Loads a period with specified parameters.
+
+        This method acts as a wrapper around the private ``__load_period`` method. It
+        accepts arbitrary keyword arguments that are passed directly to the private
+        method for execution. The purpose of allowing keyword arguments is to permit
+        flexible configuration or parameterization for loading a specific period, based
+        on the internal implementation of the private ``__load_period`` method.
+
+        Note:
+            The arguments and return values are entirely determined by the private
+            method's behavior. This method is intentionally designed to mask details
+            of the internal logic behind the abstraction.
+
+        :param kwargs: Arbitrary keyword arguments to parameterize the internal logic
+            of loading a period. The specific keys and values expected by the
+            ``__load_period`` method depend on its own internal implementation.
+        :return: The result of calling the private ``__load_period`` method with the
+            provided keyword arguments. The return type is dependent on the internal
+            implementation of ``__load_period``.
+        """
         return self.__load_period(**kwargs)

     def __load_period(self, **kwargs):
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -121,109 +121,3 @@ class ParquetSaver:
             write_index=False,
         )

-# from pathlib import Path
-# from typing import Optional
-#
-# import fsspec
-# import pyarrow as pa
-#
-# from sibi_dst.utils import Logger
-#
-#
-# class ParquetSaver:
-#     def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
-#         # Ensure df_result is a Dask DataFrame
-#         self.fs = fs or fsspec.filesystem("file")
-#         self.df_result = df_result
-#         self.parquet_storage_path = parquet_storage_path
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#
-#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
-#         full_path = self._construct_full_path(parquet_filename)
-#
-#         # We cannot check for empty DataFrame directly with Dask without computation
-#         # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
-#
-#         # Ensure directory exists and clear if necessary
-#         self._ensure_directory_exists(full_path, clear_existing=clear_existing)
-#
-#         # Define schema and save DataFrame to Parquet
-#         schema = self._define_schema()
-#         self._convert_dtypes(schema)
-#         self._save_dataframe_to_parquet(full_path, schema)
-#
-#     def _define_schema(self) -> pa.Schema:
-#         """Define a PyArrow schema dynamically based on df_result column types."""
-#         pandas_dtype_to_pa = {
-#             'object': pa.string(),
-#             'string': pa.string(),
-#             'Int64': pa.int64(),
-#             'int64': pa.int64(),
-#             'float64': pa.float64(),
-#             'float32': pa.float32(),
-#             'bool': pa.bool_(),
-#             'boolean': pa.bool_(),  # pandas nullable boolean
-#             'datetime64[ns]': pa.timestamp('ns'),
-#             'timedelta[ns]': pa.duration('ns')
-#         }
-#
-#         dtypes = self.df_result.dtypes  # No need to call .compute()
-#
-#         fields = [
-#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
-#             for col, dtype in dtypes.items()
-#         ]
-#         return pa.schema(fields)
-#
-#     def _convert_dtypes(self, schema: pa.Schema):
-#         """Convert DataFrame columns to match the specified schema."""
-#         dtype_mapping = {}
-#         for field in schema:
-#             col_name = field.name
-#             if col_name in self.df_result.columns:
-#                 if pa.types.is_string(field.type):
-#                     dtype_mapping[col_name] = 'string'
-#                 elif pa.types.is_int64(field.type):
-#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
-#                 elif pa.types.is_float64(field.type):
-#                     dtype_mapping[col_name] = 'float64'
-#                 elif pa.types.is_float32(field.type):
-#                     dtype_mapping[col_name] = 'float32'
-#                 elif pa.types.is_boolean(field.type):
-#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
-#                 elif pa.types.is_timestamp(field.type):
-#                     dtype_mapping[col_name] = 'datetime64[ns]'
-#                 else:
-#                     dtype_mapping[col_name] = 'object'  # Fallback to object
-#         # Convert dtypes
-#         self.df_result = self.df_result.astype(dtype_mapping)
-#
-#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
-#         """Construct and return the full path for the Parquet file."""
-#         _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
-#         parquet_filename = parquet_filename or "default.parquet"
-#         return Path(base_path) / parquet_filename
-#
-#     @staticmethod
-#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
-#         """Ensure that the directory for the path exists, clearing it if specified."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         directory = str(full_path.parent)
-#
-#         if fs.exists(directory):
-#             if clear_existing:
-#                 fs.rm(directory, recursive=True)
-#         else:
-#             fs.mkdirs(directory, exist_ok=True)
-#
-#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
-#         """Save the DataFrame to Parquet using the specified schema."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         print(f"Saving to {str(full_path)}")
-#         if fs.exists(str(full_path)):
-#             fs.rm(str(full_path), recursive=True)
-#
-#         # Save the Dask DataFrame to Parquet
-#         self.df_result.to_parquet(
-#             str(full_path), engine="pyarrow", schema=schema, write_index=False
-#         )
sibi_dst-0.3.33.dist-info/RECORD → sibi_dst-0.3.34.dist-info/RECORD
CHANGED
@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=CLHfzrFNqklNx5uMKAPtbZfkbBbVYR5qsiMro0RTfmA,252
 sibi_dst/df_helper/__init__.py,sha256=5yzslP6zYYOHsTtAzHnNDXHYjf_T6yW7baxwgtduWqQ,292
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=NRiLdHHO45SPwhif5JIQpfj56iC8HcffaRAyT7-TC2w,29585
 sibi_dst/df_helper/_parquet_artifact.py,sha256=K9FnKjXDmkqCzYqv5weS9scLHsPGyj0UUUoVzOtWv30,8858
 sibi_dst/df_helper/_parquet_reader.py,sha256=HhzhKtV_7qABHJvmpU2CssjNLgQHUB07eF0CqqzmkOs,3654
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -48,8 +48,8 @@ sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10
 sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
 sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
 sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
-sibi_dst/utils/parquet_saver.py,sha256=
+sibi_dst/utils/parquet_saver.py,sha256=FmSTOVhKruGw6r5G1sH3kKqsP0tCuU32KTlyQBLpXos,5092
 sibi_dst/utils/storage_manager.py,sha256=qHo5vTv-dr1roRr_mOcprSTdlAfH4Q2Dy5tQUz06Pnk,4228
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.34.dist-info/METADATA,sha256=ewd8lmlRjJg0lEeEI0ju5g20zGk7Lk1bdgBxunNpf3s,2564
+sibi_dst-0.3.34.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.34.dist-info/RECORD,,
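The sha256= values in RECORD follow the wheel metadata format: a SHA-256 digest of the file, urlsafe-base64 encoded with the trailing padding stripped, followed by the file size in bytes. A sketch of how one of those entries could be recomputed for verification (the path is a placeholder for a file inside an unpacked wheel):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# Example usage against an unpacked wheel:
# print(record_entry("sibi_dst/utils/parquet_saver.py"))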