sibi-dst 0.3.39__py3-none-any.whl → 0.3.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/__init__.py +2 -0
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +257 -0
- sibi_dst/utils/__init__.py +3 -0
- sibi_dst/utils/data_utils.py +66 -25
- sibi_dst/utils/data_wrapper.py +586 -286
- sibi_dst/utils/date_utils.py +118 -113
- sibi_dst/utils/log_utils.py +57 -18
- sibi_dst/utils/phone_formatter.py +127 -0
- {sibi_dst-0.3.39.dist-info → sibi_dst-0.3.42.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.39.dist-info → sibi_dst-0.3.42.dist-info}/RECORD +11 -9
- {sibi_dst-0.3.39.dist-info → sibi_dst-0.3.42.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/__init__.py
CHANGED
@@ -3,11 +3,13 @@ from __future__ import annotations
|
|
3
3
|
from ._df_helper import DfHelper
|
4
4
|
from ._parquet_artifact import ParquetArtifact
|
5
5
|
from ._parquet_reader import ParquetReader
|
6
|
+
from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapper
|
6
7
|
#from .data_cleaner import DataCleaner
|
7
8
|
|
8
9
|
__all__ = [
|
9
10
|
'DfHelper',
|
10
11
|
'ParquetArtifact',
|
11
12
|
'ParquetReader',
|
13
|
+
'ArtifactUpdaterMultiWrapper',
|
12
14
|
#'DataCleaner'
|
13
15
|
]
|
@@ -0,0 +1,257 @@
|
|
1
|
+
import asyncio
|
2
|
+
import logging
|
3
|
+
import datetime
|
4
|
+
import psutil
|
5
|
+
import time
|
6
|
+
from functools import total_ordering
|
7
|
+
from collections import defaultdict
|
8
|
+
from contextlib import asynccontextmanager
|
9
|
+
import signal
|
10
|
+
from sibi_dst.utils import Logger
|
11
|
+
|
12
|
+
@total_ordering
class PrioritizedItem:
    """Pair an artifact with a sortable priority so it can live in a priority queue.

    Ordering and equality consider ``priority`` only; ``artifact`` is carried
    along as an opaque payload and is never compared.  ``total_ordering``
    derives the remaining rich comparisons from ``__lt__`` and ``__eq__``.

    :param priority: Sort key; smaller values compare as "less than".
    :param artifact: Payload associated with the priority.
    """

    # Defining __eq__ already removes the inherited __hash__; say so explicitly
    # so nobody expects instances to work as dict keys or set members.
    __hash__ = None

    def __init__(self, priority, artifact):
        self.priority = priority
        self.artifact = artifact

    def __lt__(self, other):
        # Returning NotImplemented lets Python try the reflected operation and
        # raise a clean TypeError, instead of an AttributeError on `.priority`.
        if not isinstance(other, PrioritizedItem):
            return NotImplemented
        return self.priority < other.priority

    def __eq__(self, other):
        if not isinstance(other, PrioritizedItem):
            return NotImplemented
        return self.priority == other.priority

    def __repr__(self):
        return f"{type(self).__name__}(priority={self.priority!r}, artifact={self.artifact!r})"
|
23
|
+
|
24
|
+
class ArtifactUpdaterMultiWrapper:
    """Update groups of parquet artifacts concurrently with a resizable pool of asyncio workers.

    ``wrapped_classes`` maps a data-type name to an iterable of artifact
    classes.  For a requested data type every class is instantiated with the
    configured ``parquet_start_date`` / ``parquet_end_date``, queued by
    estimated complexity, and its ``update_parquet`` method is run in a thread
    by one of the workers.  A monitor task periodically resizes the worker pool
    from available memory and CPU count (both read through ``psutil``).

    Recognised keyword arguments (all optional):

    :param parquet_start_date: Start date string; defaults to January 1st of the current year.
    :param parquet_end_date: End date string; defaults to today.
    :param min_workers: Lower bound for the worker pool (default 1).
    :param max_workers: Upper bound for the worker pool (default 8).
    :param memory_per_worker_gb: Memory budget per worker in GB (default 1).
    :param monitor_interval: Seconds between pool-size re-evaluations (default 10).
    :param retry_attempts: Attempts per artifact before giving up (default 3).
    :param update_timeout_seconds: Timeout for one ``update_parquet`` call (default 600).
    :param lock_acquire_timeout_seconds: Timeout for acquiring the per-artifact lock (default 10).
    """

    def __init__(self, wrapped_classes=None, debug=False, **kwargs):
        self.wrapped_classes = wrapped_classes or {}
        self.debug = debug
        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
        self.logger.set_level(logging.DEBUG if debug else logging.INFO)

        today = datetime.datetime.today()
        self.today_str = today.strftime('%Y-%m-%d')
        self.current_year_starts_on_str = datetime.date(today.year, 1, 1).strftime('%Y-%m-%d')
        self.parquet_start_date = kwargs.get('parquet_start_date', self.current_year_starts_on_str)
        self.parquet_end_date = kwargs.get('parquet_end_date', self.today_str)

        # track concurrency and locks
        self.locks = {}
        self.worker_heartbeat = defaultdict(float)

        # dynamic scaling config
        self.min_workers = kwargs.get('min_workers', 1)
        self.max_workers = kwargs.get('max_workers', 8)
        self.memory_per_worker_gb = kwargs.get('memory_per_worker_gb', 1)  # default 1GB per worker
        self.monitor_interval = kwargs.get('monitor_interval', 10)  # default monitor interval in seconds
        self.retry_attempts = kwargs.get('retry_attempts', 3)
        self.update_timeout_seconds = kwargs.get('update_timeout_seconds', 600)
        self.lock_acquire_timeout_seconds = kwargs.get('lock_acquire_timeout_seconds', 10)

        # graceful shutdown handling; constructing the wrapper must not fail just
        # because no event loop can be obtained in the current thread
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError as exc:
            self.logger.warning(f"No event loop available; signal handlers not registered: {exc}")
        else:
            self.register_signal_handlers(loop)

    def register_signal_handlers(self, loop):
        """Ask ``loop`` to run :meth:`shutdown` on SIGINT and SIGTERM.

        Registration is best-effort: event loops that do not implement
        ``add_signal_handler`` raise ``NotImplementedError``, and handler setup
        can fail with ``RuntimeError`` / ``ValueError``; those cases are logged
        and skipped instead of aborting construction.

        :param loop: The asyncio event loop to register the handlers on.
        """
        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                loop.add_signal_handler(sig, lambda: asyncio.create_task(self.shutdown()))
            except (NotImplementedError, RuntimeError, ValueError) as exc:
                self.logger.warning(f"Could not register signal handler for {sig!r}: {exc}")

    async def shutdown(self):
        """Cancel every other task on the running loop and wait for them to finish."""
        self.logger.info("Shutdown signal received. Cleaning up...")
        tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
        for task in tasks:
            task.cancel()
        # return_exceptions=True so the CancelledError of each task is collected, not raised
        await asyncio.gather(*tasks, return_exceptions=True)
        self.logger.info("Shutdown complete.")

    def get_lock_for_artifact(self, artifact):
        """Return the lock for ``artifact``, creating it on first use.

        Locks are keyed by the artifact's class name, so all instances of the
        same artifact class share one lock.
        """
        artifact_key = artifact.__class__.__name__
        if artifact_key not in self.locks:
            self.locks[artifact_key] = asyncio.Lock()
        return self.locks[artifact_key]

    def get_artifacts(self, data_type):
        """Instantiate every artifact class registered under ``data_type``.

        :raises ValueError: If ``data_type`` is not a key of ``wrapped_classes``.
        :return: List of artifact instances built with the configured date range.
        """
        if data_type not in self.wrapped_classes:
            raise ValueError(f"Unsupported data type: {data_type}")

        return [
            artifact_class(parquet_start_date=self.parquet_start_date, parquet_end_date=self.parquet_end_date)
            for artifact_class in self.wrapped_classes[data_type]
        ]

    def estimate_complexity(self, artifact):
        """Return ``artifact.get_size_estimate()`` when available, otherwise 1.

        A failing estimate is logged and also falls back to 1.
        """
        try:
            if hasattr(artifact, 'get_size_estimate'):
                return artifact.get_size_estimate()
        except Exception as e:
            self.logger.warning(f"Failed to estimate complexity for {artifact}: {e}")
        return 1  # default

    def prioritize_tasks(self, artifacts):
        """Build a priority queue of artifacts keyed by estimated complexity.

        :return: An ``asyncio.PriorityQueue`` of ``PrioritizedItem`` entries.
        """
        queue = asyncio.PriorityQueue()
        for artifact in artifacts:
            complexity = self.estimate_complexity(artifact)
            # The complexity is stored as-is and the queue hands out the smallest
            # entry first, so low-complexity artifacts are processed first.
            # Store the negated complexity instead if the heaviest should go first.
            queue.put_nowait(PrioritizedItem(complexity, artifact))
        return queue

    def _memory_worker_limit(self, available_memory):
        """Return, as an int, how many workers fit into ``available_memory`` bytes."""
        worker_memory_bytes = self.memory_per_worker_gb * (1024 ** 3)
        if worker_memory_bytes <= 0:
            # a non-positive budget cannot constrain anything; defer to max_workers
            return int(self.max_workers)
        # int() because a float memory_per_worker_gb makes `//` return a float
        return int(available_memory // worker_memory_bytes)

    async def resource_monitor(self, queue, workers, **kwargs):
        """Monitor system resources and adjust worker count while queue is not empty.

        :param queue: The work queue the workers consume from.
        :param workers: Mutable list of worker tasks; workers are appended to or
            popped (and cancelled) from it in place.
        :param kwargs: Forwarded to every worker started here so that late
            workers call ``update_parquet`` with the same arguments as the
            initial ones.
        """
        # ids keep increasing so a worker added after a scale-down never reuses
        # the id (and heartbeat slot) of a worker that is still running
        next_worker_id = len(workers)
        while True:
            # break if queue done
            if queue.empty():
                await asyncio.sleep(0.5)
                if queue.empty():
                    break

            try:
                available_memory = psutil.virtual_memory().available
                max_workers_by_memory = self._memory_worker_limit(available_memory)

                # figure out how many workers we can sustain
                # note: we also cap by self.max_workers; cpu_count() may return None
                optimal_workers = min(psutil.cpu_count() or 1, max_workers_by_memory, self.max_workers)

                # ensure at least self.min_workers is used
                optimal_workers = int(max(self.min_workers, optimal_workers))

                current_worker_count = len(workers)

                if optimal_workers > current_worker_count:
                    # we can add more workers if queue is not empty
                    for _ in range(optimal_workers - current_worker_count):
                        worker_id = next_worker_id
                        next_worker_id += 1
                        w = asyncio.create_task(self.worker(queue, worker_id, **kwargs))
                        workers.append(w)
                        self.logger.info(f"Added worker {worker_id}. Total workers: {len(workers)}")
                elif optimal_workers < current_worker_count:
                    # remove some workers
                    # NOTE(review): a popped worker is cancelled even if it is in the
                    # middle of an update; that artifact is not re-queued.
                    for _ in range(current_worker_count - optimal_workers):
                        w = workers.pop()
                        w.cancel()
                        self.logger.info(f"Removed a worker. Total workers: {len(workers)}")

                await asyncio.sleep(self.monitor_interval)

            except asyncio.CancelledError:
                # monitor is being shut down
                break
            except Exception as e:
                self.logger.error(f"Error in resource_monitor: {e}")
                await asyncio.sleep(self.monitor_interval)

    @asynccontextmanager
    async def artifact_lock(self, artifact):
        """Hold the artifact's lock for the duration of the ``async with`` body.

        If the lock cannot be acquired within ``lock_acquire_timeout_seconds``
        the timeout is logged and the body still runs, without the lock.
        Exceptions raised by the body propagate unchanged.
        """
        lock = self.get_lock_for_artifact(artifact)
        acquired = False
        try:
            await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
            acquired = True
        except asyncio.TimeoutError:
            self.logger.error(f"Timeout acquiring lock for artifact: {artifact.__class__.__name__}")
            # continue but no actual lock was acquired

        # The single yield lives outside the acquisition try/except: an exception
        # thrown in from the body (including TimeoutError) must not be mistaken
        # for a lock timeout, and a generator-based context manager may only
        # yield once.
        try:
            yield
        finally:
            # release only what this call acquired; never a lock held by another task
            if acquired:
                lock.release()

    async def async_update_artifact(self, artifact, **kwargs):
        """Run ``artifact.update_parquet(**kwargs)`` in a thread with timeout and retries.

        Up to ``retry_attempts`` attempts are made, with exponential backoff
        (1s, 2s, 4s, ...) between attempts.  Failures are logged; nothing is
        raised to the caller when all attempts fail.
        """
        for attempt in range(self.retry_attempts):
            try:
                async with self.artifact_lock(artifact):
                    self.logger.info(
                        f"Updating artifact: {artifact.__class__.__name__}, Attempt: {attempt + 1} of {self.retry_attempts}")
                    start_time = time.time()
                    # NOTE(review): on timeout the await is abandoned, but the thread
                    # started by to_thread presumably keeps running until it returns.
                    await asyncio.wait_for(
                        asyncio.to_thread(artifact.update_parquet, **kwargs),
                        timeout=self.update_timeout_seconds
                    )
                    elapsed_time = time.time() - start_time
                    self.logger.info(
                        f"Successfully updated artifact: {artifact.__class__.__name__} in {elapsed_time:.2f}s.")
                    return

            except asyncio.TimeoutError:
                self.logger.error(f"Timeout updating artifact {artifact.__class__.__name__}, Attempt: {attempt + 1}")
            except Exception as e:
                self.logger.error(
                    f"Error updating artifact {artifact.__class__.__name__}, Attempt: {attempt + 1}: {e}")

            # exponential backoff, skipped after the last attempt where nothing follows
            if attempt + 1 < self.retry_attempts:
                await asyncio.sleep(2 ** attempt)

        self.logger.error(f"All retry attempts failed for artifact: {artifact.__class__.__name__}")

    async def worker(self, queue, worker_id, **kwargs):
        """A worker that dynamically pulls tasks from the queue.

        Runs until it is cancelled or receives ``None`` from the queue.

        :param queue: Queue of ``PrioritizedItem`` entries.
        :param worker_id: Key under which the worker records its heartbeat.
        :param kwargs: Forwarded to :meth:`async_update_artifact`.
        """
        while True:
            got_item = False
            try:
                prioritized_item = await queue.get()
                got_item = True
                if prioritized_item is None:
                    break
                artifact = prioritized_item.artifact
                # heartbeat
                self.worker_heartbeat[worker_id] = time.time()

                await self.async_update_artifact(artifact, **kwargs)

            except asyncio.CancelledError:
                self.logger.info(f"Worker {worker_id} shutting down gracefully.")
                break
            except Exception as e:
                self.logger.error(f"Error in worker {worker_id}: {e}")
            finally:
                # task_done() must match a successful get(); calling it after a
                # cancelled get() would over-count and break queue.join()
                if got_item:
                    queue.task_done()

    async def process_tasks(self, queue, initial_workers, **kwargs):
        """Start a set of workers and a resource monitor to dynamically adjust them.

        Returns once every queued item has been marked done; the monitor and
        all remaining workers are then cancelled and awaited.
        """
        # create initial workers
        workers = [
            asyncio.create_task(self.worker(queue, worker_id, **kwargs))
            for worker_id in range(initial_workers)
        ]

        # start resource monitor; it receives the same kwargs for workers it adds later
        monitor_task = asyncio.create_task(self.resource_monitor(queue, workers, **kwargs))

        # wait until queue is done
        try:
            await queue.join()
        finally:
            # cancel resource monitor and all workers, then wait for them to unwind
            monitor_task.cancel()
            for w in workers:
                w.cancel()
            await asyncio.gather(monitor_task, *workers, return_exceptions=True)

    async def update_data(self, data_type, **kwargs):
        """Update every artifact registered under ``data_type``.

        :param data_type: Key into ``wrapped_classes``.
        :param kwargs: Forwarded to each artifact's ``update_parquet``.
        :raises ValueError: If ``data_type`` is not registered.
        """
        self.logger.info(f"Processing wrapper group: {data_type} with {kwargs}")
        artifacts = self.get_artifacts(data_type)
        queue = self.prioritize_tasks(artifacts)

        # compute initial worker count (this can be low if memory is low initially)
        initial_workers = self.calculate_initial_workers(len(artifacts))
        self.logger.info(f"Initial worker count: {initial_workers} for {len(artifacts)} artifacts")

        total_start_time = time.time()
        await self.process_tasks(queue, initial_workers, **kwargs)
        total_time = time.time() - total_start_time
        self.logger.info(f"Total processing time: {total_time:.2f} seconds.")

    def calculate_initial_workers(self, artifact_count: int) -> int:
        """Compute the initial number of workers before resource_monitor can adjust.

        The result is the smallest of CPU count, memory-derived limit,
        ``artifact_count`` and ``max_workers``, but never less than
        ``min_workers``.
        """
        self.logger.info("Calculating initial worker count...")
        available_memory = psutil.virtual_memory().available
        self.logger.info(f"Available memory: {available_memory / (1024 ** 3):.2f} GB")
        worker_memory_bytes = self.memory_per_worker_gb * (1024 ** 3)
        self.logger.info(f"Memory per worker: {worker_memory_bytes / (1024 ** 3):.2f} GB")
        max_workers_by_memory = self._memory_worker_limit(available_memory)
        self.logger.info(f"Max workers by memory: {max_workers_by_memory}")
        # also consider CPU count and artifact_count; cpu_count() may return None
        cpu_count = psutil.cpu_count() or 1
        initial = min(cpu_count, max_workers_by_memory, artifact_count, self.max_workers)
        self.logger.info(f"Optimal workers: {initial} CPU: {cpu_count} Max Workers: {self.max_workers}")
        # int() so the caller can safely feed the result to range()
        return int(max(self.min_workers, initial))
|
257
|
+
|
sibi_dst/utils/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from .log_utils import Logger
|
|
4
4
|
from .date_utils import *
|
5
5
|
from .data_utils import DataUtils
|
6
6
|
from .file_utils import FileUtils
|
7
|
+
from .phone_formatter import PhoneNumberFormatter
|
7
8
|
from .filepath_generator import FilePathGenerator
|
8
9
|
from .df_utils import DfUtils
|
9
10
|
from .storage_manager import StorageManager
|
@@ -18,8 +19,10 @@ __all__ = [
|
|
18
19
|
"ConfigManager",
|
19
20
|
"ConfigLoader",
|
20
21
|
"DateUtils",
|
22
|
+
"FileAgeChecker",
|
21
23
|
"BusinessDays",
|
22
24
|
"FileUtils",
|
25
|
+
"PhoneNumberFormatter",
|
23
26
|
"DataWrapper",
|
24
27
|
"DataUtils",
|
25
28
|
"FilePathGenerator",
|
sibi_dst/utils/data_utils.py
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
from typing import Union, List
|
2
|
+
|
1
3
|
import dask.dataframe as dd
|
2
4
|
import pandas as pd
|
3
5
|
|
@@ -23,6 +25,58 @@ class DataUtils:
|
|
23
25
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
24
26
|
self.debug = kwargs.get('debug', False)
|
25
27
|
|
28
|
+
@staticmethod
def _transform_column(series, fill_value, dtype):
    """
    Coerce a column to numbers, patch the gaps, and cast the result.

    :param series: The column to transform.
    :type series: pd.Series or dd.Series
    :param fill_value: Replacement for entries that are missing or fail numeric conversion.
    :type fill_value: int or float
    :param dtype: Data type the column is cast to at the end.
    :type dtype: type
    :return: The converted column.
    :rtype: pd.Series or dd.Series
    """
    # Step 1: anything that cannot be parsed as a number becomes NaN
    as_numbers = pd.to_numeric(series, errors="coerce")
    # Step 2: NaN entries take the caller-supplied fill value
    without_gaps = as_numbers.fillna(fill_value)
    # Step 3: cast to the requested dtype
    return without_gaps.astype(dtype)
|
48
|
+
|
49
|
+
def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0,
                              dtype=int):
    """
    Transform specified numeric columns in the DataFrame by converting their data types
    to the specified dtype and replacing missing values with the given fill_value.

    Column names that are not present in ``df`` are ignored.  The columns are
    reassigned on ``df`` itself, which is also the returned object.

    :param df: DataFrame to be transformed.
    :type df: pd.DataFrame or dd.DataFrame
    :param columns: List of column names to transform.
    :type columns: list[str]
    :param fill_value: Value to replace missing or invalid data. Default is 0.
    :type fill_value: int or float
    :param dtype: Target data type for the columns. Default is int.
    :type dtype: type
    :return: Transformed DataFrame.
    :rtype: pd.DataFrame or dd.DataFrame
    """
    if not columns:
        self.logger.warning("No columns specified.")
        return df

    self.logger.debug(f"DataFrame type: {type(df)}")
    columns = [col for col in columns if col in df.columns]

    for col in columns:
        column = df[col]
        if hasattr(column, "map_partitions"):
            # partitioned (Dask-style) column: transform each partition lazily
            df[col] = column.map_partitions(
                self._transform_column, fill_value, dtype, meta=(col, dtype)
            )
        else:
            # plain pandas column has no map_partitions; transform it directly
            df[col] = self._transform_column(column, fill_value, dtype)

    return df
|
79
|
+
|
26
80
|
def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
|
27
81
|
"""
|
28
82
|
This function transforms the specified numeric columns in the given dataframe by converting
|
@@ -57,34 +111,21 @@ class DataUtils:
|
|
57
111
|
|
58
112
|
return df
|
59
113
|
|
60
|
-
def transform_boolean_columns(self, df, columns=
|
114
|
+
def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
|
61
115
|
"""
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
Parameters:
|
66
|
-
- df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
|
67
|
-
- columns (list of str): List of columns to check and transform.
|
68
|
-
- sample_size (int): Number of rows to sample for detection. Ignored for Pandas DataFrames.
|
116
|
+
Convert specified columns in the DataFrame to boolean, replacing missing values with
|
117
|
+
the given fill_value.
|
69
118
|
|
70
|
-
|
71
|
-
|
119
|
+
:param df: DataFrame to be transformed.
|
120
|
+
:type df: pd.DataFrame or dd.DataFrame
|
121
|
+
:param columns: List of column names to transform.
|
122
|
+
:type columns: list[str]
|
123
|
+
:param fill_value: Value to replace missing or invalid data. Default is 0.
|
124
|
+
:type fill_value: int or float
|
125
|
+
:return: Transformed DataFrame.
|
126
|
+
:rtype: pd.DataFrame or dd.DataFrame
|
72
127
|
"""
|
73
|
-
|
74
|
-
# Apply transformation to each specified column
|
75
|
-
for col in columns:
|
76
|
-
if col in df.columns:
|
77
|
-
# Replace NaN with 0, then convert to boolean
|
78
|
-
df[col] = df[col].map_partitions(
|
79
|
-
lambda s: pd.to_numeric(s, errors='coerce') # Convert to numeric, invalid to NaN
|
80
|
-
.fillna(0) # Replace NaN with 0
|
81
|
-
.astype(int) # Ensure integer type
|
82
|
-
.astype(bool), # Convert to boolean
|
83
|
-
meta=(col, 'bool')
|
84
|
-
)
|
85
|
-
if self.debug:
|
86
|
-
self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
|
87
|
-
return df
|
128
|
+
return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
|
88
129
|
|
89
130
|
def merge_lookup_data(self, classname, df, **kwargs):
|
90
131
|
"""
|