sibi-dst 0.3.11__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +5 -2
- sibi_dst/df_helper/plugins/http/_http_config.py +2 -3
- sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +4 -0
- sibi_dst/df_helper/plugins/parquet/_parquet_options.py +2 -0
- sibi_dst/utils/__init__.py +2 -1
- sibi_dst/utils/_data_utils.py +91 -62
- sibi_dst/utils/_data_wrapper.py +2 -320
- sibi_dst/utils/_date_utils.py +130 -0
- sibi_dst/utils/_df_utils.py +91 -0
- {sibi_dst-0.3.11.dist-info → sibi_dst-0.3.12.dist-info}/METADATA +4 -1
- {sibi_dst-0.3.11.dist-info → sibi_dst-0.3.12.dist-info}/RECORD +12 -12
- {sibi_dst-0.3.11.dist-info → sibi_dst-0.3.12.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -42,6 +42,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)


@@ -211,6 +212,7 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

     def save_to_clickhouse(self, database, table, order_by=None, **credentials):
         click_config ={
@@ -219,13 +221,14 @@ class DfHelper:
             'order_by': order_by or 'id',
         }
         credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")

     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df

     def load_period(self, **kwargs):
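The common thread in this release is dependency-injected logging: DfHelper now seeds a `logger` kwarg and hands its own logger to ParquetFilterHandler and ClickHouseWriter. A minimal sketch of the pattern, using only the `Logger.default_logger` helper that appears throughout this diff; the `Component` class below is a stand-in, not part of the package.

```python
from sibi_dst.utils import Logger

class Component:
    """Stand-in for the helpers touched in this release (ParquetFilterHandler,
    HttpConfig, ParquetConfig): accept an optional logger, fall back to a default."""

    def __init__(self, logger=None):
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

shared = Logger.default_logger(logger_name="etl-run")
component = Component(logger=shared)  # one logger instance shared across components
component.logger.info("components now log through the injected logger")
```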
sibi_dst/df_helper/plugins/http/_http_config.py
CHANGED
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)

-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-
-        self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""
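A hedged construction sketch for the new logger injection: `api_key` and the `logger` keyword come from this hunk; the import path and any other required model fields are assumptions.

```python
import asyncio
from sibi_dst.utils import Logger
from sibi_dst.df_helper.plugins.http import HttpConfig  # import path assumed from the package layout

logger = Logger.default_logger(logger_name="HttpDataSource")
# Only api_key is visible in this hunk; the model may require additional fields (e.g. a URL).
config = HttpConfig(api_key="not-a-real-key", logger=logger)

# fetch_data() is async and returns a dask DataFrame, per the context lines above.
# df = asyncio.run(config.fetch_data(endpoint="orders"))
```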
sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py
CHANGED
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger

 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']
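Usage sketch based on this hunk and the `_df_helper.py` change above; the import path and the Django-style filter key are assumptions, only the constructor and `apply_filters_dask(df, filters)` call come from this diff.

```python
import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils import Logger
from sibi_dst.df_helper.plugins.parquet import ParquetFilterHandler  # import path assumed

ddf = dd.from_pandas(
    pd.DataFrame({"created_at": pd.to_datetime(["2024-01-02", "2023-12-30"])}),
    npartitions=1,
)
handler = ParquetFilterHandler(logger=Logger.default_logger(logger_name="ParquetFilterHandler"))
# _df_helper.py above calls apply_filters_dask(df, options); the filter key below
# (date operator plus gte) is illustrative, not confirmed by this diff.
filtered = handler.apply_filters_dask(ddf, {"created_at__date__gte": "2024-01-01"})
```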
sibi_dst/df_helper/plugins/parquet/_parquet_options.py
CHANGED
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])

         # Validation for parquet path
sibi_dst/utils/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from ._credentials import ConfigManager, ConfigLoader
 from ._log_utils import Logger
-from ._date_utils import DateUtils
+from ._date_utils import DateUtils, BusinessDays
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
     "ConfigLoader",
     "Logger",
     "DateUtils",
+    "BusinessDays",
     "FileUtils",
     "DataWrapper",
     "DataUtils",
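With the re-export in place, the new `BusinessDays` helper is importable from the package root alongside the existing utilities:

```python
from sibi_dst.utils import DateUtils, BusinessDays, DataUtils, Logger
```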
sibi_dst/utils/_data_utils.py
CHANGED
@@ -7,6 +7,27 @@ class DataUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning('No columns specified')
+
+        columns = [column for column in columns if column in df.columns]
+        for col in columns:
+            if isinstance(df, dd.DataFrame):
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(fill_value)  # Replace NaN with 0
+                    .astype(dtype),
+                    meta=(col, dtype)
+                )
+            else:
+                # For Pandas DataFrame, handle mixed types and invalid values
+                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                df[col] = df[col].fillna(fill_value).astype(dtype)
+
+        return df
+
     @staticmethod
     def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
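A small usage sketch for the new `transform_numeric_cols` helper shown above; `DataUtils` is re-exported from `sibi_dst.utils` per the `__init__.py` listing, and the sample frame is made up.

```python
import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils import DataUtils

pdf = pd.DataFrame({"qty": ["3", None, "oops"], "price": [1.5, None, 2.0]})
ddf = dd.from_pandas(pdf, npartitions=1)

du = DataUtils()
# Invalid strings and missing values become fill_value, then the column is cast to dtype.
cleaned = du.transform_numeric_cols(ddf, columns=["qty"], fill_value=0, dtype=int)
print(cleaned["qty"].compute().tolist())  # [3, 0, 0]
```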
@@ -14,6 +35,7 @@ class DataUtils:

     Parameters:
     - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+    - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
     - fill_value (int or float): The value to replace NA values with.
     - transform_func (callable, optional): The transformation function to apply.
       If None, no additional transformation is applied.
@@ -28,31 +50,28 @@ class DataUtils:
         if not columns:
             return df

+        columns = [column for column in columns if column in df.columns]
         # Default transformation function (identity) if none is provided
         if transform_func is None:
             transform_func = lambda x: x

-        #
-
-
-
-
-
-
-
+        # Batch processing for Dask
+        if isinstance(df, dd.DataFrame):
+            def transform_partition(partition):
+                # Apply transformations for all numeric columns in a single pass
+                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
+                return partition
+
+            # Apply the transformation function to all specified columns
+            df = df.map_partitions(transform_partition, meta=df)
+        else:
+            # Pandas: Vectorized operations for all specified columns
+            df[columns] = df[columns].fillna(fill_value).map(transform_func)

-        df[col] = df[col].fillna(fill_value).astype(meta_type)
-        if isinstance(df, dd.DataFrame):
-            df[col] = df[col].map_partitions(
-                lambda s: s.apply(transform_func), meta=(col, meta_type)
-            )
-        else:
-            df[col] = df[col].apply(transform_func)
         return df

     @staticmethod
-    def transform_boolean_columns(df, columns=None
+    def transform_boolean_columns(df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
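The rewritten `transform_numeric_columns` now fills and transforms all requested columns in one pass via `DataFrame.map`, so it assumes a pandas version that provides it (2.1 or later). A hedged example on plain pandas:

```python
import pandas as pd
from sibi_dst.utils import DataUtils

pdf = pd.DataFrame({"a": [1.0, None, 3.0], "b": [None, 2.0, 4.0]})
# Static method: fill missing values, then apply the element-wise transform to both columns.
out = DataUtils.transform_numeric_columns(pdf, columns=["a", "b"], fill_value=0, transform_func=lambda x: x * 10)
print(out["a"].tolist())  # [10.0, 0.0, 30.0]
print(out["b"].tolist())  # [0.0, 20.0, 40.0]
```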
@@ -96,73 +115,67 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
-        #
+        # Return early if the DataFrame is empty
         if self.is_dataframe_empty(df):
             return df

-        # Extract required parameters
-
-
-
-
-        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-        fieldnames = kwargs.get('fieldnames', None)
-        column_names = kwargs.get('column_names', None)
+        # Extract and validate required parameters
+        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
+        missing_params = [param for param in required_params if param not in kwargs]
+        if missing_params:
+            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")

-
-
-
-
-
+        source_col = kwargs.pop('source_col')
+        lookup_col = kwargs.pop('lookup_col')
+        lookup_description_col = kwargs.pop('lookup_description_col')
+        source_description_alias = kwargs.pop('source_description_alias')
+
+        # Optional parameters with default values
+        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
+        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
+        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])

         if source_col not in df.columns:
-            self.logger.info(f
+            self.logger.info(f"{source_col} not in DataFrame columns")
             return df

         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
         if isinstance(ids, dd.Series):
             ids = ids.compute()
-
-
-        if not ids:
-            self.logger.info(f'No IDs found in the source column: {source_col}')
+        if not len(ids):
+            self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
-        # Set default fieldnames and column_names if not provided
-        if fieldnames is None:
-            kwargs['fieldnames'] = (lookup_col, lookup_description_col)
-        if column_names is None:
-            kwargs['column_names'] = ['temp_join_col', source_description_alias]
-
+        ids = sorted(ids.tolist())
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
-        load_kwargs
-
+        load_kwargs.update({
+            'fieldnames': fieldnames,
+            'column_names': column_names,
+            f'{lookup_col}__in': ids
+        })
         # Load lookup data
-        lookup_instance = classname()
+        lookup_instance = classname(debug=True, verbose_debug=True)
         result = lookup_instance.load(**load_kwargs)
-
+        if len(result.index) == 0:
+            self.logger.info(f"No IDs found in the source column: {source_col}")
+            return df
         # Determine the join column on the result DataFrame
-        if 'temp_join_col' in
-            temp_join_col = 'temp_join_col'
-        else:
-            temp_join_col = lookup_col
+        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col

         # Merge DataFrames
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)

         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias]
+            df[source_description_alias]=df[source_description_alias].fillna('')

         # Drop temp_join_col if present
-
-        df = df.drop(columns='temp_join_col')
+        df = df.drop(columns='temp_join_col', errors='ignore')

         return df

-
-    def is_dataframe_empty(df):
+
+    def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.

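A call sketch for the reworked `merge_lookup_data`: the four required keyword names come from `required_params` above, while `ProductLookup` is a hypothetical dataclass that must be constructible as `classname(debug=True, verbose_debug=True)` and expose `.load(**kwargs)`. The positional argument order (`classname`, then `df`) is an assumption, since the signature itself is not part of this hunk.

```python
from sibi_dst.utils import DataUtils

du = DataUtils()
merged = du.merge_lookup_data(
    ProductLookup,                         # hypothetical lookup dataclass
    df,                                    # existing pandas/dask DataFrame with a product_id column
    source_col="product_id",
    lookup_col="id",
    lookup_description_col="name",
    source_description_alias="product_name",
    fillna_source_description_alias=True,  # optional, defaults to False
)
```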
@@ -173,14 +186,30 @@ class DataUtils:
         - bool: True if the DataFrame is empty, False otherwise.
         """
         if isinstance(df, dd.DataFrame):
-
-
-
+            try:
+                return len(df.index) == 0
+            except Exception as e:
+                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                return False
+        elif isinstance(df, pd.DataFrame):
             return df.empty
+        else:
+            self.logger.error("Input must be a pandas or dask DataFrame.")
+            return False

     @staticmethod
-    def
+    def convert_to_datetime_dask(df, date_fields):
+        """
+        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
+
+        Parameters:
+        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
+        - date_fields (list of str): List of column names to convert to datetime.
+
+        Returns:
+        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        """
         for col in date_fields:
             if col in df.columns:
-                df[col] =
-        return df
+                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
+        return df
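And a short example of the completed `convert_to_datetime_dask` static method; invalid values are coerced to `NaT` instead of raising:

```python
import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils import DataUtils

ddf = dd.from_pandas(pd.DataFrame({"created_at": ["2024-01-01", "not a date"]}), npartitions=1)
ddf = DataUtils.convert_to_datetime_dask(ddf, ["created_at"])
print(ddf["created_at"].compute().tolist())  # [Timestamp('2024-01-01 00:00:00'), NaT]
```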
sibi_dst/utils/_data_wrapper.py
CHANGED
@@ -164,7 +164,7 @@ class DataWrapper:

         date_range = self.generate_date_range()
         if self.show_progress:
-            date_range = tqdm(date_range, desc="Evaluating update plan", unit="date")
+            date_range = tqdm(date_range, desc=f"Evaluating update plan {self.__class__.__name__}", unit="date")

         for current_date in date_range:
             folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
@@ -207,325 +207,7 @@ class DataWrapper:
         return update_plan_table


-
-# from typing import Type, Any, Dict, Optional
-# import fsspec
-# import pandas as pd
-# from IPython.display import display
-#
-# from sibi_dst.utils import Logger
-# from tqdm import tqdm
-# from sibi_dst.utils import ParquetSaver
-#
-# class DataWrapper:
-#     DEFAULT_MAX_AGE_MINUTES = 1440
-#     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
-# … (the remaining ~300 removed lines are the rest of this commented-out legacy
-#    DataWrapper implementation: __init__, convert_to_date, ensure_forward_slash,
-#    generate_date_range, process, is_file_older_than, process_date,
-#    remove_empty_directories and two generate_update_plan_with_conditions drafts) …
+
 # # Usage:
 # # wrapper = DataWrapper(
 # #     dataclass=YourDataClass,
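For reference, a hedged construction sketch that mirrors the commented usage block retained at the bottom of the file; `YourDataClass` is hypothetical and the exact parameter set of the live class should be checked against `_data_wrapper.py` itself.

```python
from sibi_dst.utils import DataWrapper, Logger

wrapper = DataWrapper(
    dataclass=YourDataClass,            # hypothetical class with a load_period(dt_field, start, end) API
    date_field="created_at",
    data_path="/data/events/",
    parquet_filename="events.parquet",
    start_date="2024-01-01",
    end_date="2024-01-31",
    show_progress=True,                 # drives the tqdm description changed in this hunk
    logger=Logger.default_logger(logger_name="DataWrapper"),
)
wrapper.process()
```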
sibi_dst/utils/_date_utils.py
CHANGED
@@ -1,5 +1,7 @@
 import datetime
 from typing import Union, Tuple, Callable, Dict, Any
+
+import numpy as np
 import pandas as pd
 from sibi_dst.utils import Logger

@@ -118,6 +120,134 @@ class DateUtils:
         'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
     }

+class BusinessDays:
+    def __init__(self, holiday_list, logger):
+        """
+        Initialize a BusinessDays object with a given holiday list.
+        """
+        self.logger = logger
+        self.HOLIDAY_LIST = holiday_list
+        bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
+        self.bd_cal = np.busdaycalendar(holidays=bd_holidays, weekmask="1111100")
+        self.holidays = self.bd_cal.holidays
+        self.week_mask = self.bd_cal.weekmask
+
+    def get_business_days_count(self, begin_date, end_date):
+        """
+        Calculate the number of business days between two dates.
+        """
+        try:
+            begin_date = pd.to_datetime(begin_date)
+            end_date = pd.to_datetime(end_date)
+        except Exception as e:
+            raise ValueError(f"Invalid date format: {e}")
+
+        years = [str(year) for year in range(begin_date.year, end_date.year + 1)]
+        if not all(year in self.HOLIDAY_LIST for year in years):
+            raise ValueError("Not all years in date range are in the holiday list")
+
+        return np.busday_count(
+            begin_date.strftime("%Y-%m-%d"),
+            end_date.strftime("%Y-%m-%d"),
+            busdaycal=self.bd_cal,
+        )
+
+    def calc_business_days_from_df(self, df, begin_date_col, end_date_col, result_col="business_days"):
+        """
+        Add a column to a Dask DataFrame with the number of business days between two date columns.
+        """
+        if not all(col in df.columns for col in [begin_date_col, end_date_col]):
+            self.logger.error("Column names not found in DataFrame")
+            raise ValueError("Required columns are missing")
+
+        # Extract holidays and weekmask to recreate the busdaycalendar
+        holidays = self.bd_cal.holidays
+        weekmask = self.bd_cal.weekmask
+
+        # Define a function to calculate business days
+        def calculate_business_days(row, holidays, weekmask):
+            begin_date = pd.to_datetime(row[begin_date_col])
+            end_date = pd.to_datetime(row[end_date_col])
+            busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
+            return np.busday_count(
+                begin_date.strftime("%Y-%m-%d"),
+                end_date.strftime("%Y-%m-%d"),
+                busdaycal=busdaycal,
+            )
+
+        # Define a wrapper function for partition-wise operations
+        def apply_business_days(partition, holidays, weekmask):
+            return partition.apply(
+                calculate_business_days, axis=1, holidays=holidays, weekmask=weekmask
+            )
+
+        # Apply the function using map_partitions
+        df[result_col] = df.map_partitions(
+            apply_business_days,
+            holidays,
+            weekmask,
+            meta=(result_col, "int64"),
+        )
+
+        return df
+
+    def add_business_days(self, start_date, n_days):
+        """
+        Add n_days business days to start_date.
+        """
+        try:
+            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
+        except ValueError:
+            raise ValueError("Date should be a string in the format YYYY-MM-DD")
+
+        if str(start_date.year) not in self.HOLIDAY_LIST:
+            self.logger.warning(f"Year {start_date.year} is not in the holiday list")
+
+        return np.busday_offset(
+            start_date.strftime("%Y-%m-%d"),
+            n_days,
+            roll="forward",
+            busdaycal=self.bd_cal,
+        )
+
+    def calc_sla_end_date(self, df, start_date_col, n_days_col, result_col="sla_end_date"):
+        """
+        Add a column to a Dask DataFrame with SLA end dates based on start date and SLA days.
+        """
+        if not all(col in df.columns for col in [start_date_col, n_days_col]):
+            raise ValueError("Column names not found in DataFrame")
+
+        # Extract holidays and weekmask to recreate the busdaycalendar
+        holidays = self.bd_cal.holidays
+        weekmask = self.bd_cal.weekmask
+
+        # Define a function to calculate SLA end dates
+        def calculate_sla_end_date(row, holidays, weekmask):
+            start_date = pd.to_datetime(row[start_date_col])
+            n_days = row[n_days_col]
+            busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
+            return np.busday_offset(
+                start_date.strftime("%Y-%m-%d"),
+                n_days,
+                roll="forward",
+                busdaycal=busdaycal,
+            )
+
+        # Define a wrapper for partition-wise operation
+        def apply_sla_end_date(partition, holidays, weekmask):
+            return partition.apply(
+                calculate_sla_end_date, axis=1, holidays=holidays, weekmask=weekmask
+            )
+
+        # Apply the function using map_partitions
+        df[result_col] = df.map_partitions(
+            apply_sla_end_date,
+            holidays,
+            weekmask,
+            meta=(result_col, "object"),
+        )
+
+        return df
 # Class enhancements
 # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
 #                                                 datetime.date.today() + datetime.timedelta(days=13)))
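A usage sketch for the new `BusinessDays` class; the holiday calendar below is illustrative.

```python
from sibi_dst.utils import BusinessDays, Logger

holidays = {"2024": ["2024-01-01", "2024-12-25"]}
bd = BusinessDays(holidays, Logger.default_logger(logger_name="BusinessDays"))

print(bd.get_business_days_count("2024-01-01", "2024-01-08"))  # 4: five weekdays minus the Jan 1 holiday
print(bd.add_business_days("2024-12-24", 2))                   # 2024-12-27: skips the Dec 25 holiday
```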
sibi_dst/utils/_df_utils.py
CHANGED
@@ -12,6 +12,97 @@ class DfUtils:
         """
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

+    def align_and_merge_by_type(self, df_left, df_right, type_mapping, how='left'):
+        """
+        Align column data types in two DataFrames based on a type mapping dictionary and perform the merge.
+
+        Parameters:
+        - df_left (pd.DataFrame or dd.DataFrame): Left DataFrame
+        - df_right (pd.DataFrame or dd.DataFrame): Right DataFrame
+        - type_mapping (dict): Dictionary mapping target dtypes to column pairs.
+          Example: {
+              'integer': [('customer_id', 'temp1'), ('product_type_id', 'temp2')],
+              'string': [('group2', 'temp4')]
+          }
+
+        Returns:
+        - Merged DataFrame
+        """
+        # Map string keys to actual dtypes
+        dtype_map = {
+            'integer': 'int64',
+            'float': 'float64',
+            'string': 'string',
+            'datetime': 'datetime64[ns]',
+            'boolean': 'bool',
+        }
+
+        # Iterate over each dtype and align the column pairs
+        for target_type, column_pairs in type_mapping.items():
+            if target_type not in dtype_map:
+                self.logger.error(f"Unsupported type: {target_type}")
+
+            for left_col, right_col in column_pairs:
+                # Align dtypes in left and right DataFrames
+                if left_col in df_left.columns and right_col in df_right.columns:
+                    df_left[left_col] = df_left[left_col].astype(dtype_map[target_type])
+                    df_right[right_col] = df_right[right_col].astype(dtype_map[target_type])
+
+        # Flatten all column pairs for the merge operation
+        all_pairs = [pair for pairs in type_mapping.values() for pair in pairs]
+
+        # Perform the merge
+        return df_left.merge(
+            df_right,
+            how=how,
+            left_on=[pair[0] for pair in all_pairs],
+            right_on=[pair[1] for pair in all_pairs]
+        )
+
+    def exclude_from_dataframe(self, df, conditions):
+        """
+        Generic function to filter rows from a DataFrame (Pandas or Dask).
+
+        Parameters:
+        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame to filter.
+        - conditions (list of tuples): List of conditions to apply for filtering.
+          Each condition is a tuple: (column_name, operator, value).
+
+        Returns:
+        - pandas.DataFrame or dask.dataframe.DataFrame: Filtered DataFrame.
+        """
+        import operator
+
+        # Mapping string operators to actual Python operators
+        ops = {
+            "==": operator.eq,
+            "!=": operator.ne,
+            "<": operator.lt,
+            "<=": operator.le,
+            ">": operator.gt,
+            ">=": operator.ge,
+        }
+        # Ensure all specified columns exist in the DataFrame
+        missing_columns = [col for col, _, _ in conditions if col not in df.columns]
+        if missing_columns:
+            self.logger.info(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+            return df
+
+        # Build the combined filtering condition
+        combined_condition = None
+        for col, op, value in conditions:
+            if op not in ops:
+                raise ValueError(f"Unsupported operator: {op}")
+
+            # Get the individual condition
+            condition = ops[op](df[col], value)
+
+            # Combine the condition with AND (&)
+            combined_condition = condition if combined_condition is None else (combined_condition & condition)
+
+        # Apply the filtering and return the DataFrame
+        return df[~combined_condition]
+
     def load_grouped_activity(self, df, group_by_expr, group_expr='count', debug=False):
         """
         Groups the DataFrame by the specified expression and computes the size.
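A short example of the new `exclude_from_dataframe` helper; the direct module import is used because a top-level re-export of `DfUtils` is not shown in this diff.

```python
import pandas as pd
from sibi_dst.utils._df_utils import DfUtils  # module path taken from the RECORD listing below

df = pd.DataFrame({"status": ["active", "void", "active"], "amount": [10, 0, 5]})
utils = DfUtils()

# Rows matching ALL listed conditions are dropped (the combined mask is negated).
kept = utils.exclude_from_dataframe(df, [("status", "==", "void"), ("amount", "<=", 0)])
print(kept["status"].tolist())  # ['active', 'active']
```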
{sibi_dst-0.3.11.dist-info → sibi_dst-0.3.12.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.
+Version: 0.3.12
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -9,6 +9,8 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: chardet (>=5.2.0,<6.0.0)
+Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
@@ -29,6 +31,7 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: sqlmodel (>=0.0.22,<0.0.23)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
 Description-Content-Type: text/markdown

 # sibi-dst
{sibi_dst-0.3.11.dist-info → sibi_dst-0.3.12.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=43-eY9mDU-j-QFeAtdMjIb3KuC2_hYzLjVi177_EKAo,13006
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
 sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
 sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
@@ -14,10 +14,10 @@ sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PF
 sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=jryDojeA62rB3seRaWWMjsAmekKacK5xctwCQGVklPQ,9063
 sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
 sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
-sibi_dst/df_helper/plugins/http/_http_config.py,sha256=
+sibi_dst/df_helper/plugins/http/_http_config.py,sha256=WH0d4vsxfZRhWrWI4iTVAnhsdY3421SBr9kXYZVfeYQ,2126
 sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
-sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=
-sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=
+sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
+sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
 sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
 sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=6IjQEREXqTAzSJE95FKfXjRkTlEjRMS4hJ_yMpyKDTg,5223
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
@@ -27,19 +27,19 @@ sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTe
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
-sibi_dst/utils/__init__.py,sha256=
+sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,831
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
 sibi_dst/utils/_clickhouse_writer.py,sha256=mdgszbyVluhGvDmvsHY4XDTZrp42L3xtdmiyn3z2bYM,8534
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=
-sibi_dst/utils/_data_wrapper.py,sha256=
-sibi_dst/utils/_date_utils.py,sha256=
-sibi_dst/utils/_df_utils.py,sha256=
+sibi_dst/utils/_data_utils.py,sha256=BvmjMNSkICy671BmjW68RhvDMfN5uAXwhffSV-wEwmk,9185
+sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
+sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
+sibi_dst/utils/_df_utils.py,sha256=9_dNYoZ9_ofU0t_sxMdsXALWCuh02gvqUrei-6Lhr6w,10910
 sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
 sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.12.dist-info/METADATA,sha256=5mezOBAiUV2pMgNsVqI7iCZZgmxeZpLuYWDYUAZCTVk,2030
+sibi_dst-0.3.12.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.12.dist-info/RECORD,,

{sibi_dst-0.3.11.dist-info → sibi_dst-0.3.12.dist-info}/WHEEL
File without changes