sibi-dst 2025.1.3__py3-none-any.whl → 2025.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +57 -47
- sibi_dst/df_helper/_parquet_reader.py +9 -13
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +97 -0
- sibi_dst/utils/clickhouse_writer.py +5 -4
- sibi_dst/utils/data_wrapper.py +69 -84
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +96 -375
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +72 -22
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/METADATA +3 -1
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/RECORD +24 -27
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/WHEEL +0 -0
@@ -1,91 +0,0 @@
|
|
1
|
-
from typing import Union, Optional
|
2
|
-
import pandas as pd
|
3
|
-
import dask.dataframe as dd
|
4
|
-
|
5
|
-
# Refactored DfHelper class
|
6
|
-
class DfHelper:
|
7
|
-
"""
|
8
|
-
DfHelper is a utility class that orchestrates loading and processing data.
|
9
|
-
It uses a configured BackendStrategy to handle the specifics of data loading.
|
10
|
-
"""
|
11
|
-
df: Union[dd.DataFrame, pd.DataFrame] = None
|
12
|
-
|
13
|
-
def __init__(self, backend_strategy: BackendStrategy,
|
14
|
-
params_config: ParamsConfig,
|
15
|
-
as_pandas: bool = False,
|
16
|
-
debug: bool = False,
|
17
|
-
logger: Optional[Logger] = None):
|
18
|
-
|
19
|
-
self.backend_strategy = backend_strategy
|
20
|
-
self._backend_params = params_config # Needed for post-processing and field mapping
|
21
|
-
self.as_pandas = as_pandas
|
22
|
-
self.debug = debug
|
23
|
-
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
24
|
-
self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
|
25
|
-
|
26
|
-
# Other attributes like parquet saving paths can be passed in here if needed
|
27
|
-
# self.parquet_storage_path = kwargs.get("parquet_storage_path")
|
28
|
-
|
29
|
-
def __enter__(self):
|
30
|
-
return self
|
31
|
-
|
32
|
-
def __exit__(self, exc_type, exc_val, traceback):
|
33
|
-
# Cleanup logic for resources that DfHelper itself might manage
|
34
|
-
# The connection cleanup is now the responsibility of the caller who creates the strategy
|
35
|
-
if hasattr(self.backend_strategy, 'connection') and hasattr(self.backend_strategy.connection, 'close'):
|
36
|
-
self.backend_strategy.connection.close()
|
37
|
-
return False
|
38
|
-
|
39
|
-
def load(self, **options):
|
40
|
-
"""
|
41
|
-
Loads data using the configured backend strategy, applies transformations,
|
42
|
-
and returns a DataFrame.
|
43
|
-
"""
|
44
|
-
try:
|
45
|
-
self.logger.debug(f"Loading data using {self.backend_strategy.__class__.__name__}...")
|
46
|
-
# 1. Delegate loading to the strategy object
|
47
|
-
self.df = self.backend_strategy.load(**options)
|
48
|
-
|
49
|
-
# 2. Perform post-processing (these methods remain in DfHelper)
|
50
|
-
self.__process_loaded_data()
|
51
|
-
self.__post_process_df()
|
52
|
-
self.logger.debug("Data successfully loaded and processed.")
|
53
|
-
|
54
|
-
except Exception as e:
|
55
|
-
self.logger.error(f"Failed to load data using {self.backend_strategy.__class__.__name__}: {e}")
|
56
|
-
self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
|
57
|
-
|
58
|
-
if self.as_pandas and isinstance(self.df, dd.DataFrame):
|
59
|
-
return self.df.compute()
|
60
|
-
return self.df
|
61
|
-
|
62
|
-
def load_period(self, start: str, end: str, dt_field: str, **kwargs):
|
63
|
-
"""
|
64
|
-
Loads data for a specific period by delegating filter creation to the strategy.
|
65
|
-
"""
|
66
|
-
if dt_field is None:
|
67
|
-
raise ValueError("dt_field must be provided")
|
68
|
-
|
69
|
-
# Parse and validate dates
|
70
|
-
start_dt = self.parse_date(start)
|
71
|
-
end_dt = self.parse_date(end)
|
72
|
-
if start_dt > end_dt:
|
73
|
-
raise ValueError("The 'start' date cannot be later than the 'end' date.")
|
74
|
-
|
75
|
-
# Delegate the creation of filter logic to the current strategy
|
76
|
-
field_map = getattr(self._backend_params, 'field_map', {}) or {}
|
77
|
-
period_filters = self.backend_strategy.build_period_filter(
|
78
|
-
dt_field, start_dt, end_dt, field_map
|
79
|
-
)
|
80
|
-
|
81
|
-
# Combine with other filters and load
|
82
|
-
all_filters = {**kwargs, **period_filters}
|
83
|
-
self.logger.debug(f"Loading period with combined filters: {all_filters}")
|
84
|
-
return self.load(**all_filters)
|
85
|
-
|
86
|
-
# The methods __process_loaded_data, __post_process_df, save_to_parquet,
|
87
|
-
# save_to_clickhouse, and parse_date remain unchanged as they are
|
88
|
-
# part of the orchestration logic, not the loading strategy itself.
|
89
|
-
|
90
|
-
# ... (paste those methods here without modification) ...
|
91
|
-
# ... __process_loaded_data, __post_process_df, save_to_parquet, etc. ...
|
File without changes
|