sibi-dst 0.3.56.tar.gz → 0.3.58.tar.gz

This diff shows the changes between two publicly released versions of this package, as published to a supported registry, and is provided for informational purposes only.
Files changed (83)
  1. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/pyproject.toml +1 -1
  3. sibi_dst-0.3.58/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +261 -0
  4. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/_df_helper.py +58 -26
  5. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/_parquet_artifact.py +29 -11
  6. sibi_dst-0.3.58/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +202 -0
  7. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +6 -2
  8. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/__init__.py +2 -0
  9. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/data_wrapper.py +34 -93
  10. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/parquet_saver.py +15 -12
  11. sibi_dst-0.3.58/sibi_dst/utils/update_planner.py +237 -0
  12. sibi_dst-0.3.56/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -262
  13. sibi_dst-0.3.56/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -109
  14. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/README.md +0 -0
  15. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/__init__.py +0 -0
  16. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/__init__.py +0 -0
  17. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  18. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/__init__.py +0 -0
  19. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  20. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  21. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  22. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  23. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  24. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  25. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  26. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  27. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  28. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  29. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  30. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  31. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  32. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  33. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/core/__init__.py +0 -0
  34. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/core/_defaults.py +0 -0
  35. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  36. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/core/_params_config.py +0 -0
  37. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/core/_query_config.py +0 -0
  38. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/data_cleaner.py +0 -0
  39. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/geopy_helper/__init__.py +0 -0
  40. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  41. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/geopy_helper/utils.py +0 -0
  42. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/osmnx_helper/__init__.py +0 -0
  43. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  44. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  45. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  46. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  47. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/osmnx_helper/utils.py +0 -0
  48. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/tests/__init__.py +0 -0
  49. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  50. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/airflow_manager.py +0 -0
  51. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/clickhouse_writer.py +0 -0
  52. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/credentials.py +0 -0
  53. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/data_from_http_source.py +0 -0
  54. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/data_utils.py +0 -0
  55. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/date_utils.py +0 -0
  56. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/df_utils.py +0 -0
  57. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/file_utils.py +0 -0
  58. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/filepath_generator.py +0 -0
  59. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/log_utils.py +0 -0
  60. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/phone_formatter.py +0 -0
  61. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/storage_config.py +0 -0
  62. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/storage_manager.py +0 -0
  63. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/utils/webdav_client.py +0 -0
  64. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/__init__.py +0 -0
  65. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/__init__.py +0 -0
  66. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  67. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  68. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  69. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  70. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  71. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  72. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  73. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  74. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  75. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  76. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  77. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  78. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  79. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  80. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  81. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  82. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/utils/__init__.py +0 -0
  83. {sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-0.3.56 → sibi_dst-0.3.58}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.56
+Version: 0.3.58
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
{sibi_dst-0.3.56 → sibi_dst-0.3.58}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.56"
+version = "0.3.58"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
sibi_dst-0.3.58/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py (new file)
@@ -0,0 +1,261 @@
+import asyncio
+import logging
+import datetime
+import psutil
+import time
+from functools import total_ordering
+from collections import defaultdict
+from contextlib import asynccontextmanager
+import signal
+from sibi_dst.utils import Logger
+
+@total_ordering
+class PrioritizedItem:
+    def __init__(self, priority, artifact):
+        self.priority = priority
+        self.artifact = artifact
+
+    def __lt__(self, other):
+        return self.priority < other.priority
+
+    def __eq__(self, other):
+        return self.priority == other.priority
+
+class ArtifactUpdaterMultiWrapper:
+    def __init__(self, wrapped_classes=None, debug=False, **kwargs):
+        self.wrapped_classes = wrapped_classes or {}
+        self.debug = debug
+        self.logger = kwargs.setdefault(
+            'logger', Logger.default_logger(logger_name=self.__class__.__name__)
+        )
+        self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+
+        today = datetime.datetime.today()
+        self.parquet_start_date = kwargs.get(
+            'parquet_start_date',
+            datetime.date(today.year, 1, 1).strftime('%Y-%m-%d')
+        )
+        self.parquet_end_date = kwargs.get(
+            'parquet_end_date',
+            today.strftime('%Y-%m-%d')
+        )
+
+        # track pending/completed/failed artifacts
+        self.pending = set()
+        self.completed = set()
+        self.failed = set()
+
+        # concurrency primitives
+        self.locks = {}
+        self.locks_lock = asyncio.Lock()
+        self.worker_heartbeat = defaultdict(float)
+        self.workers_lock = asyncio.Lock()
+
+        # dynamic scaling config
+        self.min_workers = kwargs.get('min_workers', 1)
+        self.max_workers = kwargs.get('max_workers', 3)
+        self.memory_per_worker_gb = kwargs.get('memory_per_worker_gb', 1)
+        self.monitor_interval = kwargs.get('monitor_interval', 10)
+        self.retry_attempts = kwargs.get('retry_attempts', 3)
+        self.update_timeout_seconds = kwargs.get('update_timeout_seconds', 600)
+        self.lock_acquire_timeout_seconds = kwargs.get('lock_acquire_timeout_seconds', 10)
+
+    async def get_lock_for_artifact(self, artifact):
+        key = artifact.__class__.__name__
+        async with self.locks_lock:
+            if key not in self.locks:
+                self.locks[key] = asyncio.Lock()
+            return self.locks[key]
+
+    def get_artifacts(self, data_type):
+        if data_type not in self.wrapped_classes:
+            raise ValueError(f"Unsupported data type: {data_type}")
+        artifacts = [cls(
+            parquet_start_date=self.parquet_start_date,
+            parquet_end_date=self.parquet_end_date,
+            logger=self.logger,
+            debug=self.debug
+        ) for cls in self.wrapped_classes[data_type]]
+        # seed pending set and clear others
+        self.pending = set(artifacts)
+        self.completed.clear()
+        self.failed.clear()
+        return artifacts
+
+    def estimate_complexity(self, artifact):
+        try:
+            return artifact.get_size_estimate()
+        except Exception:
+            return 1
+
+    def prioritize_tasks(self, artifacts):
+        queue = asyncio.PriorityQueue()
+        for art in artifacts:
+            queue.put_nowait(PrioritizedItem(self.estimate_complexity(art), art))
+        return queue
+
+    async def resource_monitor(self, queue, workers):
+        while not queue.empty():
+            try:
+                avail = psutil.virtual_memory().available
+                max_by_mem = avail // (self.memory_per_worker_gb * 2**30)
+                optimal = max(self.min_workers,
+                              min(psutil.cpu_count(), max_by_mem, self.max_workers))
+                async with self.workers_lock:
+                    current = len(workers)
+                    if optimal > current:
+                        for _ in range(optimal - current):
+                            wid = len(workers)
+                            workers.append(asyncio.create_task(self.worker(queue, wid)))
+                            self.logger.info(f"Added worker {wid}")
+                    elif optimal < current:
+                        for _ in range(current - optimal):
+                            w = workers.pop()
+                            w.cancel()
+                            self.logger.info("Removed a worker")
+                await asyncio.sleep(self.monitor_interval)
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Monitor error: {e}")
+                await asyncio.sleep(self.monitor_interval)
+
+    @asynccontextmanager
+    async def artifact_lock(self, artifact):
+        lock = await self.get_lock_for_artifact(artifact)
+        try:
+            await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
+            yield
+        finally:
+            if lock.locked():
+                lock.release()
+
+    async def async_update_artifact(self, artifact, **kwargs):
+        for attempt in range(1, self.retry_attempts + 1):
+            lock = await self.get_lock_for_artifact(artifact)
+            try:
+                await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
+                try:
+                    self.logger.info(f"Updating {artifact.__class__.__name__} (attempt {attempt})")
+                    await asyncio.wait_for(
+                        asyncio.to_thread(artifact.update_parquet, **kwargs),
+                        timeout=self.update_timeout_seconds
+                    )
+                    # mark success
+                    async with self.workers_lock:
+                        self.pending.discard(artifact)
+                        self.completed.add(artifact)
+                        self.logger.info(
+                            f"✅ {artifact.__class__.__name__} done — "
+                            f"{len(self.completed)}/{len(self.completed) + len(self.pending) + len(self.failed)} completed, "
+                            f"{len(self.failed)} failed"
+                        )
+                    return
+                finally:
+                    if lock.locked():
+                        lock.release()
+            except asyncio.TimeoutError:
+                self.logger.warning(f"Timeout on {artifact.__class__.__name__}, attempt {attempt}")
+            except Exception as e:
+                self.logger.error(f"Error on {artifact}: {e}")
+            finally:
+                if lock.locked():
+                    lock.release()
+                await asyncio.sleep(2 ** (attempt - 1))
+
+        # all retries exhausted -> mark failure
+        async with self.workers_lock:
+            self.pending.discard(artifact)
+            self.failed.add(artifact)
+            self.logger.error(f"✖️ Permanently failed {artifact.__class__.__name__}")
+
+    async def worker(self, queue, worker_id, **kwargs):
+        while True:
+            try:
+                item = await queue.get()
+                art = item.artifact
+                self.worker_heartbeat[worker_id] = time.time()
+                await self.async_update_artifact(art, **kwargs)
+            except asyncio.CancelledError:
+                self.logger.info(f"Worker {worker_id} stopped")
+                break
+            finally:
+                queue.task_done()
+
+    def calculate_initial_workers(self, count: int) -> int:
+        avail = psutil.virtual_memory().available
+        max_by_mem = avail // (self.memory_per_worker_gb * 2**30)
+        return max(self.min_workers,
+                   min(psutil.cpu_count(), max_by_mem, count, self.max_workers))
+
+    async def update_data(self, data_type, **kwargs):
+        self.logger.info(f"Starting update for {data_type}")
+        artifacts = self.get_artifacts(data_type)
+        queue = self.prioritize_tasks(artifacts)
+        init = self.calculate_initial_workers(len(artifacts))
+        tasks = [asyncio.create_task(self.worker(queue, i, **kwargs)) for i in range(init)]
+        monitor = asyncio.create_task(self.resource_monitor(queue, tasks))
+        await queue.join()
+        monitor.cancel()
+        for t in tasks:
+            t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+        self.logger.info(self.format_results_table())
+        self.logger.info("All artifacts processed.")
+
+    def format_results_table(self):
+        results = self.get_update_status()
+        headers = ["Metric", "Value"]
+        rows = [
+            ["Total", results['total']],
+            ["Completed", results['completed']],
+            ["Pending", results['pending']],
+            ["Failed", results['failed']],
+            ["Pending Items", len(results['pending_items'])],
+            ["Failed Items", len(results['failed_items'])]
+        ]
+
+        # Find max lengths for alignment
+        max_metric = max(len(str(row[0])) for row in rows)
+        max_value = max(len(str(row[1])) for row in rows)
+
+        format_str = "{:<%d} {:>%d}" % (max_metric, max_value)
+
+        table = [
+            "\n",
+            format_str.format(*headers),
+            "-" * (max_metric + max_value + 2)
+        ]
+
+        for row in rows:
+            table.append(format_str.format(row[0], row[1]))
+
+        return "\n".join(table)
+
+    def get_update_status(self):
+        total = len(self.pending) + len(self.completed) + len(self.failed)
+        return {
+            "total": total,
+            "completed": len(self.completed),
+            "pending": len(self.pending),
+            "failed": len(self.failed),
+            "pending_items": [a.__class__.__name__ for a in self.pending],
+            "failed_items": [a.__class__.__name__ for a in self.failed]
+        }
+
+# Top-level driver
+# environment = None  # fill this in with your wrapped_classes dict
+#
+# async def main():
+#     wrapper = ArtifactUpdaterMultiWrapper(
+#         wrapped_classes=environment,
+#         debug=True
+#     )
+#     loop = asyncio.get_running_loop()
+#     for sig in (signal.SIGINT, signal.SIGTERM):
+#         loop.add_signal_handler(sig, lambda: asyncio.create_task(wrapper.shutdown()))
+#     await wrapper.update_data("your_data_type")
+#
+# if __name__ == "__main__":
+#     asyncio.run(main())
+
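For reference, a minimal driver for the new wrapper might look like the sketch below. MyArtifact is a hypothetical stand-in for a real artifact class; the module-path import is used because this diff does not show whether the class is re-exported from sibi_dst.df_helper. Per the code above, artifact classes are instantiated with parquet_start_date, parquet_end_date, logger and debug keywords, and must expose update_parquet() (run off the event loop via asyncio.to_thread) and, optionally, get_size_estimate() for queue ordering.

import asyncio

from sibi_dst.df_helper._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapper

class MyArtifact:  # hypothetical artifact; any class with this surface works
    def __init__(self, parquet_start_date=None, parquet_end_date=None, logger=None, debug=False):
        self.logger = logger

    def get_size_estimate(self):
        return 10  # smaller estimates are popped from the priority queue first

    def update_parquet(self, **kwargs):
        pass  # the actual parquet refresh would go here

async def main():
    wrapper = ArtifactUpdaterMultiWrapper(
        wrapped_classes={"my_data": [MyArtifact]},
        max_workers=2,
        retry_attempts=2,
    )
    await wrapper.update_data("my_data")
    print(wrapper.get_update_status())

asyncio.run(main())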
{sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/_df_helper.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import asyncio
 import datetime
 import logging
@@ -6,10 +8,10 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
-from dask import delayed, compute
+import fsspec
 import pandas as pd
+from dask import delayed, compute
 from pydantic import BaseModel
-import fsspec
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
@@ -45,7 +47,7 @@ class DfHelper:
     :ivar df: The DataFrame currently being processed or loaded.
     :type df: Union[dd.DataFrame, pd.DataFrame]
     :ivar backend_django: Configuration for interacting with Django database backends.
-    :type backend_django: Optional[DjangoConnectionConfig]
+    :type backend_connection: Optional[DjangoConnectionConfig]
     :ivar _backend_query: Internal configuration for query handling.
     :type _backend_query: Optional[QueryConfig]
     :ivar _backend_params: Internal parameters configuration for DataFrame handling.
@@ -54,8 +56,6 @@
     :type backend_parquet: Optional[ParquetConfig]
     :ivar backend_http: Configuration for interacting with HTTP-based backends.
     :type backend_http: Optional[HttpConfig]
-    :ivar backend_sqlalchemy: Configuration for interacting with SQLAlchemy-based databases.
-    :type backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig]
     :ivar parquet_filename: The filename for a Parquet file, if applicable.
     :type parquet_filename: str
     :ivar logger: Logger instance used for debugging and information logging.
@@ -64,12 +64,11 @@
     :type default_config: Dict
     """
     df: Union[dd.DataFrame, pd.DataFrame] = None
-    backend_django: Optional[DjangoConnectionConfig] = None
+    backend_db_connection: Optional[Union[DjangoConnectionConfig | SqlAlchemyConnectionConfig]] = None
    _backend_query: Optional[QueryConfig] = None
    _backend_params: Optional[ParamsConfig] = None
    backend_parquet: Optional[ParquetConfig] = None
    backend_http: Optional[HttpConfig] = None
-    backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
    parquet_filename: str = None
    logger: Logger
    default_config: Dict = None
@@ -91,7 +90,7 @@
         self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
-        self.fs =kwargs.setdefault("fs", fsspec.filesystem('file'))
+        self.fs = kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)
 
     def __str__(self):
@@ -100,6 +99,34 @@
     def __call__(self, **options):
         return self.load(**options)
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.__cleanup()
+        return False
+
+    def __cleanup(self):
+        """
+        Clean up resources when exiting the context manager.
+        This method is called when the context manager exits.
+        """
+
+        if self.backend_db_connection:
+            if getattr(self.backend_db_connection, "dispose_idle_connections", None):
+                self.backend_db_connection.dispose_idle_connections()
+            if getattr(self.backend_db_connection, "close", None):
+                self.backend_db_connection.close()
+
+        self.backend_db_connection = None
+
+        if self.backend_parquet:
+            self.backend_parquet = None
+        if self.backend_http:
+            self.backend_http = None
+        self._backend_query = None
+        self._backend_params = None
+
     def __post_init(self, **kwargs):
         """
         Initializes backend-specific configurations based on the provided backend type and other
@@ -111,20 +138,19 @@
         Additional parameters for specific backend types are extracted here.
         :return: None
         """
-        self.logger.debug(f"backend used: {self.backend}")
-        self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
+        # self.logger.debug(f"backend used: {self.backend}")
+        # self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
         self._backend_query = self.__get_config(QueryConfig, kwargs)
         self._backend_params = self.__get_config(ParamsConfig, kwargs)
         if self.backend == 'django_db':
-            self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
+            self.backend_db_connection = self.__get_config(DjangoConnectionConfig, kwargs)
         elif self.backend == 'parquet':
             self.parquet_filename = kwargs.setdefault("parquet_filename", None)
             self.backend_parquet = ParquetConfig(**kwargs)
         elif self.backend == 'http':
             self.backend_http = HttpConfig(**kwargs)
         elif self.backend == 'sqlalchemy':
-            self.backend_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
-
+            self.backend_db_connection = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
 
     def __get_config(self, model: [T], kwargs: Dict[str, Any]) -> Union[T]:
         """
@@ -134,11 +160,13 @@
         :param kwargs: The dictionary of keyword arguments.
         :return: The initialized Pydantic model instance.
         """
+        kwargs.setdefault("debug", self.debug)
+        kwargs.setdefault("logger", self.logger)
         # Extract keys that the model can accept
         recognized_keys = set(model.__annotations__.keys())
         self.logger.debug(f"recognized keys: {recognized_keys}")
         model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
-        self.logger.debug(f"model_kwargs: {model_kwargs}")
+        # self.logger.debug(f"model_kwargs: {model_kwargs}")
         return model(**model_kwargs)
 
     def load_parallel(self, **options):
@@ -171,10 +199,10 @@
         `as_pandas` is set to True, or kept in its native backend format otherwise.
         """
         # this will be the universal method to load data from a df irrespective of the backend
-        df = self.__load(**options)
+        self.df = self.__load(**options)
         if self.as_pandas:
-            return df.compute()
-        return df
+            return self.df.compute()
+        return self.df
 
     def __load(self, **options):
         """
@@ -196,7 +224,7 @@
         """
         if self.backend == 'django_db':
             self._backend_params.parse_params(options)
-            return self.__load_from_db(**options)
+            return self.__load_from_django_db(**options)
         elif self.backend == 'sqlalchemy':
             self._backend_params.parse_params(options)
             return self.__load_from_sqlalchemy(**options)
@@ -227,7 +255,7 @@
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
-                self.backend_sqlalchemy,
+                self.backend_db_connection,
                 self._backend_query,
                 self._backend_params,
                 self.logger,
@@ -236,6 +264,7 @@
             self.df = db_loader.build_and_load()
             self.__process_loaded_data()
             self.__post_process_df()
+            self.backend_db_connection.close()
             self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
@@ -243,7 +272,7 @@
 
         return self.df
 
-    def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    def __load_from_django_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """
         Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
         and applies further post-processing before returning the dataframe. If the operation fails, an
@@ -258,7 +287,7 @@
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
-                self.backend_django,
+                self.backend_db_connection,
                 self._backend_query,
                 self._backend_params,
                 self.logger,
@@ -307,6 +336,7 @@
         :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
             or if the specified `index_col` is not found in the DataFrame.
         """
+        self.logger.debug("Post-processing DataFrame.")
         df_params = self._backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
         index_col = df_params.get("index_col", None)
@@ -357,16 +387,16 @@
 
         :return: None
         """
-        self.logger.debug(f"Type of self.df: {type(self.df)}")
+        self.logger.debug(f"Processing loaded data...")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self._backend_params.field_map or {}
-            if isinstance(field_map, dict):
+            if isinstance(field_map, dict) and field_map != {}:
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
                 missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
 
                 if missing_columns:
                     self.logger.warning(
-                        f"The following columns in field_map are not in the DataFrame: {missing_columns}")
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}, field map: {field_map}")
 
                 def rename_columns(df, mapping):
                     return df.rename(columns=mapping)
@@ -376,6 +406,8 @@
                 self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)
 
             self.logger.debug("Processing of loaded data completed.")
+        else:
+            self.logger.debug("DataFrame is empty, skipping processing.")
 
     def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
         """
@@ -536,14 +568,14 @@
 
         # Common logic for Django and SQLAlchemy
         if self.backend == 'django_db':
-            model_fields = {field.name: field for field in self.backend_django.model._meta.get_fields()}
+            model_fields = {field.name: field for field in self.backend_db_connection.model._meta.get_fields()}
             if mapped_field not in model_fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the Django model.")
             field_type = type(model_fields[mapped_field]).__name__
             is_date_field = field_type == 'DateField'
             is_datetime_field = field_type == 'DateTimeField'
         elif self.backend == 'sqlalchemy':
-            model = self.backend_sqlalchemy.model
+            model = self.backend_db_connection.model
             fields = [column.name for column in model.__table__.columns]
             if mapped_field not in fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the SQLAlchemy model.")
{sibi_dst-0.3.56 → sibi_dst-0.3.58}/sibi_dst/df_helper/_parquet_artifact.py
@@ -1,5 +1,6 @@
 import datetime
 import logging
+import threading
 from typing import Optional, Any, Dict
 
 import dask.dataframe as dd
@@ -78,6 +79,7 @@ class ParquetArtifact(DfHelper):
             `parquet_filename`, `parquet_start_date`,
             or `parquet_end_date`) are missing or not set properly.
         """
+        self._lock = threading.Lock()
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,
@@ -119,21 +121,36 @@
         super().__init__(**self.config)
 
     def load(self, **kwargs):
-        self.df = super().load(**kwargs)
+        with self._lock:
+            self.df = super().load(**kwargs)
         return self.df
 
     def generate_parquet(self, **kwargs) -> None:
         """
         Generate a Parquet file using the configured DataWrapper class.
         """
-        params = self._prepare_params(kwargs)
-        dw = DataWrapper(self.data_wrapper_class, **params)
-        dw.process()
+        with self._lock:
+            params = self._prepare_params(kwargs)
+            dw = DataWrapper(self.data_wrapper_class, **params)
+            dw.process()
+
+    def __enter__(self):
+        if getattr(self, "_entered", False):
+            return self
+        self._entered = True
+        self.ensure_directory_exists(self.parquet_storage_path)
+        return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        # Ensure resources are cleaned up
-        if self.fs:
-            self.fs.close()
+        try:
+            if getattr(self, "_entered", False) and self.fs:
+                self.fs.close()
+        except Exception as e:
+            self.logger.warning(f"Error closing filesystem: {e}")
+        finally:
+            self._entered = False
+        # return False so exceptions aren't suppressed
+        return False
 
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
         """Update the Parquet file with data from a specific period."""
@@ -223,7 +240,8 @@
 
     def ensure_directory_exists(self, path: str) -> None:
         """Ensure the directory exists in the specified filesystem."""
-        try:
-            self.fs.makedirs(path, exist_ok=True)
-        except Exception as e:
-            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+        with self._lock:
+            try:
+                self.fs.makedirs(path, exist_ok=True)
+            except Exception as e:
+                raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")