sibi-dst 0.3.39__tar.gz → 0.3.42__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/pyproject.toml +1 -1
  3. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/__init__.py +2 -0
  4. sibi_dst-0.3.42/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +257 -0
  5. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/__init__.py +3 -0
  6. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/data_utils.py +66 -25
  7. sibi_dst-0.3.42/sibi_dst/utils/data_wrapper.py +665 -0
  8. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/date_utils.py +118 -113
  9. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/log_utils.py +57 -18
  10. sibi_dst-0.3.42/sibi_dst/utils/phone_formatter.py +127 -0
  11. sibi_dst-0.3.39/sibi_dst/utils/data_wrapper.py +0 -365
  12. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/README.md +0 -0
  13. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/__init__.py +0 -0
  14. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/_df_helper.py +0 -0
  15. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  16. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  17. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/__init__.py +0 -0
  18. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  19. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  20. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  21. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  22. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  23. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  24. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  25. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  26. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  27. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  28. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  29. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  30. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  31. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  32. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  33. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  34. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/core/__init__.py +0 -0
  35. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/core/_defaults.py +0 -0
  36. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  37. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/core/_params_config.py +0 -0
  38. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/core/_query_config.py +0 -0
  39. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/data_cleaner.py +0 -0
  40. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/geopy_helper/__init__.py +0 -0
  41. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  42. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/geopy_helper/utils.py +0 -0
  43. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/osmnx_helper/__init__.py +0 -0
  44. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  45. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  46. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  47. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  48. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/osmnx_helper/utils.py +0 -0
  49. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/tests/__init__.py +0 -0
  50. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  51. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/airflow_manager.py +0 -0
  52. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/clickhouse_writer.py +0 -0
  53. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/credentials.py +0 -0
  54. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/df_utils.py +0 -0
  55. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/file_utils.py +0 -0
  56. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/filepath_generator.py +0 -0
  57. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/parquet_saver.py +0 -0
  58. {sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/storage_manager.py +0 -0
{sibi_dst-0.3.39 → sibi_dst-0.3.42}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.39
+ Version: 0.3.42
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
{sibi_dst-0.3.39 → sibi_dst-0.3.42}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sibi-dst"
- version = "0.3.39"
+ version = "0.3.42"
  description = "Data Science Toolkit"
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
  readme = "README.md"
{sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/df_helper/__init__.py
@@ -3,11 +3,13 @@ from __future__ import annotations
  from ._df_helper import DfHelper
  from ._parquet_artifact import ParquetArtifact
  from ._parquet_reader import ParquetReader
+ from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapper
  #from .data_cleaner import DataCleaner
 
  __all__ = [
      'DfHelper',
      'ParquetArtifact',
      'ParquetReader',
+     'ArtifactUpdaterMultiWrapper',
      #'DataCleaner'
  ]
sibi_dst-0.3.42/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
@@ -0,0 +1,257 @@
+ import asyncio
+ import logging
+ import datetime
+ import psutil
+ import time
+ from functools import total_ordering
+ from collections import defaultdict
+ from contextlib import asynccontextmanager
+ import signal
+ from sibi_dst.utils import Logger
+
+ @total_ordering
+ class PrioritizedItem:
+     def __init__(self, priority, artifact):
+         self.priority = priority
+         self.artifact = artifact
+
+     def __lt__(self, other):
+         return self.priority < other.priority
+
+     def __eq__(self, other):
+         return self.priority == other.priority
+
+ class ArtifactUpdaterMultiWrapper:
+     def __init__(self, wrapped_classes=None, debug=False, **kwargs):
+         self.wrapped_classes = wrapped_classes or {}
+         self.debug = debug
+         self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+         self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+
+         today = datetime.datetime.today()
+         self.today_str = today.strftime('%Y-%m-%d')
+         self.current_year_starts_on_str = datetime.date(today.year, 1, 1).strftime('%Y-%m-%d')
+         self.parquet_start_date = kwargs.get('parquet_start_date', self.current_year_starts_on_str)
+         self.parquet_end_date = kwargs.get('parquet_end_date', self.today_str)
+
+         # track concurrency and locks
+         self.locks = {}
+         self.worker_heartbeat = defaultdict(float)
+
+         # graceful shutdown handling
+         loop = asyncio.get_event_loop()
+         self.register_signal_handlers(loop)
+
+         # dynamic scaling config
+         self.min_workers = kwargs.get('min_workers', 1)
+         self.max_workers = kwargs.get('max_workers', 8)
+         self.memory_per_worker_gb = kwargs.get('memory_per_worker_gb', 1)  # default 1 GB per worker
+         self.monitor_interval = kwargs.get('monitor_interval', 10)  # default monitor interval in seconds
+         self.retry_attempts = kwargs.get('retry_attempts', 3)
+         self.update_timeout_seconds = kwargs.get('update_timeout_seconds', 600)
+         self.lock_acquire_timeout_seconds = kwargs.get('lock_acquire_timeout_seconds', 10)
+
+     def register_signal_handlers(self, loop):
+         for sig in (signal.SIGINT, signal.SIGTERM):
+             loop.add_signal_handler(sig, lambda: asyncio.create_task(self.shutdown()))
+
+     async def shutdown(self):
+         self.logger.info("Shutdown signal received. Cleaning up...")
+         tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
+         [task.cancel() for task in tasks]
+         await asyncio.gather(*tasks, return_exceptions=True)
+         self.logger.info("Shutdown complete.")
+
+     def get_lock_for_artifact(self, artifact):
+         artifact_key = artifact.__class__.__name__
+         if artifact_key not in self.locks:
+             self.locks[artifact_key] = asyncio.Lock()
+         return self.locks[artifact_key]
+
+     def get_artifacts(self, data_type):
+         if data_type not in self.wrapped_classes:
+             raise ValueError(f"Unsupported data type: {data_type}")
+
+         return [
+             artifact_class(parquet_start_date=self.parquet_start_date, parquet_end_date=self.parquet_end_date)
+             for artifact_class in self.wrapped_classes[data_type]
+         ]
+
+     def estimate_complexity(self, artifact):
+         try:
+             if hasattr(artifact, 'get_size_estimate'):
+                 return artifact.get_size_estimate()
+         except Exception as e:
+             self.logger.warning(f"Failed to estimate complexity for {artifact}: {e}")
+         return 1  # default
+
+     def prioritize_tasks(self, artifacts):
+         queue = asyncio.PriorityQueue()
+         for artifact in artifacts:
+             complexity = self.estimate_complexity(artifact)
+             # we invert the complexity to ensure higher complexity -> higher priority
+             # if you want high complexity first, store negative complexity in the priority queue
+             # or if the smaller number means earlier processing, just keep as is
+             queue.put_nowait(PrioritizedItem(complexity, artifact))
+         return queue
+
+     async def resource_monitor(self, queue, workers):
+         """Monitor system resources and adjust worker count while queue is not empty."""
+         while True:
+             # break if queue done
+             if queue.empty():
+                 await asyncio.sleep(0.5)
+                 if queue.empty():
+                     break
+
+             try:
+                 available_memory = psutil.virtual_memory().available
+                 worker_memory_bytes = self.memory_per_worker_gb * (1024 ** 3)
+                 max_workers_by_memory = available_memory // worker_memory_bytes
+
+                 # figure out how many workers we can sustain
+                 # note: we also cap by self.max_workers
+                 optimal_workers = min(psutil.cpu_count(), max_workers_by_memory, self.max_workers)
+
+                 # ensure at least self.min_workers is used
+                 optimal_workers = max(self.min_workers, optimal_workers)
+
+                 current_worker_count = len(workers)
+
+                 if optimal_workers > current_worker_count:
+                     # we can add more workers if queue is not empty
+                     diff = optimal_workers - current_worker_count
+                     for _ in range(diff):
+                         worker_id = len(workers)
+                         # create a new worker
+                         w = asyncio.create_task(self.worker(queue, worker_id))
+                         workers.append(w)
+                         self.logger.info(f"Added worker {worker_id}. Total workers: {len(workers)}")
+                 elif optimal_workers < current_worker_count:
+                     # remove some workers
+                     diff = current_worker_count - optimal_workers
+                     for _ in range(diff):
+                         w = workers.pop()
+                         w.cancel()
+                         self.logger.info(f"Removed a worker. Total workers: {len(workers)}")
+
+                 await asyncio.sleep(self.monitor_interval)
+
+             except asyncio.CancelledError:
+                 # monitor is being shut down
+                 break
+             except Exception as e:
+                 self.logger.error(f"Error in resource_monitor: {e}")
+                 await asyncio.sleep(self.monitor_interval)
+
+     @asynccontextmanager
+     async def artifact_lock(self, artifact):
+         lock = self.get_lock_for_artifact(artifact)
+         try:
+             await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
+             yield
+         except asyncio.TimeoutError:
+             self.logger.error(f"Timeout acquiring lock for artifact: {artifact.__class__.__name__}")
+             yield  # continue but no actual lock was acquired
+         finally:
+             if lock.locked():
+                 lock.release()
+
+     async def async_update_artifact(self, artifact, **kwargs):
+         for attempt in range(self.retry_attempts):
+             try:
+                 async with self.artifact_lock(artifact):
+                     self.logger.info(
+                         f"Updating artifact: {artifact.__class__.__name__}, Attempt: {attempt + 1} of {self.retry_attempts}")
+                     start_time = time.time()
+                     await asyncio.wait_for(
+                         asyncio.to_thread(artifact.update_parquet, **kwargs),
+                         timeout=self.update_timeout_seconds
+                     )
+                     elapsed_time = time.time() - start_time
+                     self.logger.info(
+                         f"Successfully updated artifact: {artifact.__class__.__name__} in {elapsed_time:.2f}s.")
+                     return
+
+             except asyncio.TimeoutError:
+                 self.logger.error(f"Timeout updating artifact {artifact.__class__.__name__}, Attempt: {attempt + 1}")
+             except Exception as e:
+                 self.logger.error(
+                     f"Error updating artifact {artifact.__class__.__name__}, Attempt: {attempt + 1}: {e}")
+
+             # exponential backoff
+             await asyncio.sleep(2 ** attempt)
+
+         self.logger.error(f"All retry attempts failed for artifact: {artifact.__class__.__name__}")
+
+     async def worker(self, queue, worker_id, **kwargs):
+         """A worker that dynamically pulls tasks from the queue."""
+         while True:
+             try:
+                 prioritized_item = await queue.get()
+                 if prioritized_item is None:
+                     break
+                 artifact = prioritized_item.artifact
+                 # heartbeat
+                 self.worker_heartbeat[worker_id] = time.time()
+
+                 await self.async_update_artifact(artifact, **kwargs)
+
+             except asyncio.CancelledError:
+                 self.logger.info(f"Worker {worker_id} shutting down gracefully.")
+                 break
+             except Exception as e:
+                 self.logger.error(f"Error in worker {worker_id}: {e}")
+             finally:
+                 queue.task_done()
+
+     async def process_tasks(self, queue, initial_workers, **kwargs):
+         """Start a set of workers and a resource monitor to dynamically adjust them."""
+         # create initial workers
+         workers = []
+         for worker_id in range(initial_workers):
+             w = asyncio.create_task(self.worker(queue, worker_id, **kwargs))
+             workers.append(w)
+
+         # start resource monitor
+         monitor_task = asyncio.create_task(self.resource_monitor(queue, workers))
+
+         # wait until queue is done
+         try:
+             await queue.join()
+         finally:
+             # cancel resource monitor
+             monitor_task.cancel()
+             # all workers done
+             for w in workers:
+                 w.cancel()
+             await asyncio.gather(*workers, return_exceptions=True)
+
+     async def update_data(self, data_type, **kwargs):
+         self.logger.info(f"Processing wrapper group: {data_type} with {kwargs}")
+         artifacts = self.get_artifacts(data_type)
+         queue = self.prioritize_tasks(artifacts)
+
+         # compute initial worker count (this can be low if memory is low initially)
+         initial_workers = self.calculate_initial_workers(len(artifacts))
+         self.logger.info(f"Initial worker count: {initial_workers} for {len(artifacts)} artifacts")
+
+         total_start_time = time.time()
+         await self.process_tasks(queue, initial_workers, **kwargs)
+         total_time = time.time() - total_start_time
+         self.logger.info(f"Total processing time: {total_time:.2f} seconds.")
+
+     def calculate_initial_workers(self, artifact_count: int) -> int:
+         """Compute the initial number of workers before resource_monitor can adjust."""
+         self.logger.info("Calculating initial worker count...")
+         available_memory = psutil.virtual_memory().available
+         self.logger.info(f"Available memory: {available_memory / (1024 ** 3):.2f} GB")
+         worker_memory_bytes = self.memory_per_worker_gb * (1024 ** 3)
+         self.logger.info(f"Memory per worker: {worker_memory_bytes / (1024 ** 3):.2f} GB")
+         max_workers_by_memory = available_memory // worker_memory_bytes
+         self.logger.info(f"Max workers by memory: {max_workers_by_memory}")
+         # also consider CPU count and artifact_count
+         initial = min(psutil.cpu_count(), max_workers_by_memory, artifact_count, self.max_workers)
+         self.logger.info(f"Optimal workers: {initial} CPU: {psutil.cpu_count()} Max Workers: {self.max_workers}")
+         return max(self.min_workers, initial)
+
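For orientation, here is a minimal usage sketch of the new ArtifactUpdaterMultiWrapper. DemoArtifact, the "demo" group key, the dates, and the max_workers value are illustrative placeholders, not part of the library; the wrapper only requires each wrapped class to accept parquet_start_date/parquet_end_date and to expose update_parquet() (and, optionally, get_size_estimate()). Because the constructor registers SIGINT/SIGTERM handlers on the current event loop, the sketch assumes a POSIX platform and builds the wrapper inside a running loop.

    import asyncio
    from sibi_dst.df_helper import ArtifactUpdaterMultiWrapper

    class DemoArtifact:
        # Hypothetical stand-in artifact: the wrapper only calls update_parquet()
        # and, if present, get_size_estimate().
        def __init__(self, parquet_start_date=None, parquet_end_date=None):
            self.start, self.end = parquet_start_date, parquet_end_date

        def update_parquet(self, **kwargs):
            print(f"updating parquet for {self.start} .. {self.end}")

    async def main():
        wrapper = ArtifactUpdaterMultiWrapper(
            wrapped_classes={"demo": [DemoArtifact]},  # hypothetical group key
            parquet_start_date="2025-01-01",
            parquet_end_date="2025-01-31",
            max_workers=2,
        )
        await wrapper.update_data("demo")

    asyncio.run(main())

update_data instantiates one artifact per wrapped class, orders them by estimated complexity in an asyncio.PriorityQueue, and lets resource_monitor grow or shrink the worker pool between min_workers and max_workers based on available memory and CPU count.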
{sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/__init__.py
@@ -4,6 +4,7 @@ from .log_utils import Logger
  from .date_utils import *
  from .data_utils import DataUtils
  from .file_utils import FileUtils
+ from .phone_formatter import PhoneNumberFormatter
  from .filepath_generator import FilePathGenerator
  from .df_utils import DfUtils
  from .storage_manager import StorageManager
@@ -18,8 +19,10 @@ __all__ = [
      "ConfigManager",
      "ConfigLoader",
      "DateUtils",
+     "FileAgeChecker",
      "BusinessDays",
      "FileUtils",
+     "PhoneNumberFormatter",
      "DataWrapper",
      "DataUtils",
      "FilePathGenerator",
{sibi_dst-0.3.39 → sibi_dst-0.3.42}/sibi_dst/utils/data_utils.py
@@ -1,3 +1,5 @@
+ from typing import Union, List
+
  import dask.dataframe as dd
  import pandas as pd
 
@@ -23,6 +25,58 @@ class DataUtils:
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
          self.debug = kwargs.get('debug', False)
 
+     @staticmethod
+     def _transform_column(series, fill_value, dtype):
+         """
+         Helper method to transform a column by converting it to numeric, filling missing values,
+         and casting to the specified dtype.
+
+         :param series: The column to transform.
+         :type series: pd.Series or dd.Series
+         :param fill_value: Value to replace missing or invalid data.
+         :type fill_value: int or float
+         :param dtype: Target data type for the column.
+         :type dtype: type
+         :return: Transformed column.
+         :rtype: pd.Series or dd.Series
+         """
+         return (
+             pd.to_numeric(series, errors="coerce")  # Convert to numeric, invalid to NaN
+             .fillna(fill_value)  # Replace NaN with fill_value
+             .astype(dtype)  # Convert to target dtype
+         )
+
+     def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0,
+                                   dtype=int):
+         """
+         Transform specified numeric columns in the DataFrame by converting their data types
+         to the specified dtype and replacing missing values with the given fill_value.
+
+         :param df: DataFrame to be transformed.
+         :type df: pd.DataFrame or dd.DataFrame
+         :param columns: List of column names to transform.
+         :type columns: list[str]
+         :param fill_value: Value to replace missing or invalid data. Default is 0.
+         :type fill_value: int or float
+         :param dtype: Target data type for the columns. Default is int.
+         :type dtype: type
+         :return: Transformed DataFrame.
+         :rtype: pd.DataFrame or dd.DataFrame
+         """
+         if not columns:
+             self.logger.warning("No columns specified.")
+             return df
+
+         self.logger.debug(f"DataFrame type: {type(df)}")
+         columns = [col for col in columns if col in df.columns]
+
+         for col in columns:
+             df[col] = df[col].map_partitions(
+                 self._transform_column, fill_value, dtype, meta=(col, dtype)
+             )
+
+         return df
+
      def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
          """
          This function transforms the specified numeric columns in the given dataframe by converting
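A short usage sketch of the new transform helpers follows. Note that _transform_column is applied through Series.map_partitions, which is a Dask-only API, so despite the Union type hints the example feeds a Dask DataFrame; it also assumes DataUtils can be constructed without an explicit logger, and the column names and sample values are illustrative.

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.utils import DataUtils

    pdf = pd.DataFrame({"qty": ["1", "2", None, "x"], "flag": [1, 0, None, 1]})
    ddf = dd.from_pandas(pdf, npartitions=2)

    utils = DataUtils()  # assumes the logger argument is optional
    ddf = utils.transform_numeric_columns(ddf, ["qty"], fill_value=0, dtype=int)  # "x"/None -> 0
    ddf = utils.transform_boolean_columns(ddf, ["flag"])  # None -> False
    print(ddf.compute())

As the next hunk shows, transform_boolean_columns now simply delegates to transform_numeric_columns with dtype=bool, so both calls share the same to_numeric / fillna / astype path.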
@@ -57,34 +111,21 @@ class DataUtils:
 
          return df
 
-     def transform_boolean_columns(self, df, columns=None):
+     def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
          """
-         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
-         and convert them to boolean. Detection is performed using a sample.
-
-         Parameters:
-         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-         - columns (list of str): List of columns to check and transform.
-         - sample_size (int): Number of rows to sample for detection. Ignored for Pandas DataFrames.
+         Convert specified columns in the DataFrame to boolean, replacing missing values with
+         the given fill_value.
 
-         Returns:
-         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
+         :param df: DataFrame to be transformed.
+         :type df: pd.DataFrame or dd.DataFrame
+         :param columns: List of column names to transform.
+         :type columns: list[str]
+         :param fill_value: Value to replace missing or invalid data. Default is 0.
+         :type fill_value: int or float
+         :return: Transformed DataFrame.
+         :rtype: pd.DataFrame or dd.DataFrame
          """
-
-         # Apply transformation to each specified column
-         for col in columns:
-             if col in df.columns:
-                 # Replace NaN with 0, then convert to boolean
-                 df[col] = df[col].map_partitions(
-                     lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
-                     .fillna(0)  # Replace NaN with 0
-                     .astype(int)  # Ensure integer type
-                     .astype(bool),  # Convert to boolean
-                     meta=(col, 'bool')
-                 )
-         if self.debug:
-             self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
-         return df
+         return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
 
      def merge_lookup_data(self, classname, df, **kwargs):
          """