sibi-dst 0.3.58__tar.gz → 0.3.59__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/pyproject.toml +1 -1
  3. sibi_dst-0.3.59/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +239 -0
  4. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_df_helper.py +7 -3
  5. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_parquet_artifact.py +143 -52
  6. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +3 -3
  7. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/__init__.py +3 -2
  8. sibi_dst-0.3.59/sibi_dst/utils/data_wrapper.py +258 -0
  9. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/date_utils.py +8 -8
  10. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/log_utils.py +1 -1
  11. sibi_dst-0.3.59/sibi_dst/utils/manifest_manager.py +154 -0
  12. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/update_planner.py +96 -85
  13. sibi_dst-0.3.58/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -261
  14. sibi_dst-0.3.58/sibi_dst/utils/data_wrapper.py +0 -249
  15. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/README.md +0 -0
  16. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/__init__.py +0 -0
  17. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/__init__.py +0 -0
  18. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  19. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/__init__.py +0 -0
  20. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  21. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  22. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  23. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  24. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  25. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  26. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  27. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  28. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  29. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  30. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  31. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  32. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  33. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  34. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  35. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/__init__.py +0 -0
  36. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_defaults.py +0 -0
  37. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  38. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_params_config.py +0 -0
  39. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_query_config.py +0 -0
  40. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/data_cleaner.py +0 -0
  41. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/geopy_helper/__init__.py +0 -0
  42. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  43. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/geopy_helper/utils.py +0 -0
  44. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/__init__.py +0 -0
  45. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  46. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  47. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  48. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  49. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/utils.py +0 -0
  50. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/tests/__init__.py +0 -0
  51. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  52. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/airflow_manager.py +0 -0
  53. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/clickhouse_writer.py +0 -0
  54. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/credentials.py +0 -0
  55. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/data_from_http_source.py +0 -0
  56. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/data_utils.py +0 -0
  57. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/df_utils.py +0 -0
  58. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/file_utils.py +0 -0
  59. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/filepath_generator.py +0 -0
  60. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/parquet_saver.py +0 -0
  61. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/phone_formatter.py +0 -0
  62. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/storage_config.py +0 -0
  63. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/storage_manager.py +0 -0
  64. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/webdav_client.py +0 -0
  65. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/__init__.py +0 -0
  66. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/__init__.py +0 -0
  67. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  68. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  69. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  70. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  71. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  72. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  73. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  74. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  75. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  76. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  77. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  78. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  79. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  80. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  81. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  82. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  83. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/utils/__init__.py +0 -0
  84. {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.58
+ Version: 0.3.59
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sibi-dst"
- version = "0.3.58"
+ version = "0.3.59"
  description = "Data Science Toolkit"
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
  readme = "README.md"
@@ -0,0 +1,239 @@
+ import asyncio
+ import logging
+ import datetime
+ import random
+ from typing import Any, Callable, Dict, List, Optional, Type
+
+ from sibi_dst.utils import Logger
+
+
+ class ArtifactUpdaterMultiWrapper:
+     """
+     Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
+
+     Features:
+       - Caps concurrency at max_workers via semaphore
+       - Optionally prioritises tasks via a priority function or static method on artifact classes
+       - Tracks per-artifact completion times
+       - Configurable retry/backoff strategy
+       - Optional metrics integration
+       - Thread-safe within a single asyncio loop
+
+     Usage:
+         wrapper = ArtifactUpdaterMultiWrapper(
+             wrapped_classes={
+                 'mydata': [DataArtifactA, DataArtifactB],
+             },
+             max_workers=4,
+             retry_attempts=3,
+             update_timeout_seconds=600,
+             backoff_base=2,
+             backoff_max=60,
+             backoff_jitter=0.1,
+             priority_fn=None, # or custom
+             metrics_client=None,
+             debug=True,
+             logger=None,
+             artifact_class_kwargs={
+                 'fs': my_fs,
+                 'parquet_storage_path': 's3://bucket/data',
+                 'logger': my_logger,
+                 'debug': True,
+             }
+         )
+         await wrapper.update_data('mydata', period='ytd', overwrite=True)
+     """
+     def __init__(
+         self,
+         wrapped_classes: Dict[str, List[Type]],
+         *,
+         max_workers: int = 3,
+         retry_attempts: int = 3,
+         update_timeout_seconds: int = 600,
+         backoff_base: int = 2,
+         backoff_max: Optional[int] = 60,
+         backoff_jitter: float = 0.1,
+         priority_fn: Optional[Callable[[Type], int]] = None,
+         metrics_client: Any = None,
+         debug: bool = False,
+         logger: Optional[logging.Logger] = None,
+         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         self.wrapped_classes = wrapped_classes
+         self.max_workers = max_workers
+         self.retry_attempts = retry_attempts
+         self.update_timeout_seconds = update_timeout_seconds
+         self.backoff_base = backoff_base
+         self.backoff_max = backoff_max
+         self.backoff_jitter = backoff_jitter
+         self.priority_fn = priority_fn
+         self.metrics_client = metrics_client
+
+         self.debug = debug
+         self.logger = logger or Logger.default_logger(
+             logger_name=self.__class__.__name__,
+             log_level=Logger.DEBUG if debug else Logger.INFO
+         )
+
+         # Default artifact init kwargs
+         today = datetime.datetime.today() + datetime.timedelta(days=1)
+         default_kwargs = {
+             'parquet_start_date': today.strftime('%Y-%m-%d'),
+             'parquet_end_date': today.strftime('%Y-%m-%d'),
+             'logger': self.logger,
+             'debug': self.debug,
+         }
+         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
+
+         # State
+         self.completion_times: Dict[str, float] = {}
+         self.failed: List[str] = []
+         self.original_classes: List[Type] = []
+
+     def get_artifact_classes(self, data_type: str) -> List[Type]:
+         """
+         Retrieve artifact classes by data type.
+         """
+         self.logger.info(f"Fetching artifact classes for '{data_type}'")
+         if data_type not in self.wrapped_classes:
+             raise ValueError(f"Unsupported data type: {data_type}")
+         classes = self.wrapped_classes[data_type]
+         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
+         return classes
+
+     def estimate_priority(self, artifact_cls: Type) -> int:
+         """
+         Determine task priority for ordering. Lower values run first.
+         """
+         name = artifact_cls.__name__
+         if self.priority_fn:
+             try:
+                 pr = self.priority_fn(artifact_cls)
+                 self.logger.debug(f"priority_fn for {name}: {pr}")
+                 return pr
+             except Exception as e:
+                 self.logger.warning(f"priority_fn error for {name}: {e}")
+         try:
+             fs = self.artifact_class_kwargs.get('fs')
+             path = self.artifact_class_kwargs.get('parquet_storage_path')
+             pr=1
+             if hasattr(artifact_cls, 'get_size_estimate'):
+                 pr = artifact_cls.get_size_estimate(fs, path)
+             self.logger.debug(f"Estimated priority for {name}: {pr}")
+             return pr
+         except Exception:
+             return 1
+
+     async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
+         """
+         Wrap update_artifact in a semaphore slot to limit concurrency.
+         """
+         async with sem:
+             name = artifact_cls.__name__
+             start = asyncio.get_event_loop().time()
+             self.logger.info(f"Starting update for {name}")
+             try:
+                 for attempt in range(1, self.retry_attempts + 1):
+                     try:
+                         artifact = await asyncio.to_thread(
+                             artifact_cls, **self.artifact_class_kwargs
+                         )
+                         await asyncio.wait_for(
+                             asyncio.to_thread(
+                                 artifact.update_parquet, **update_kwargs
+                             ),
+                             timeout=self.update_timeout_seconds
+                         )
+                         duration = asyncio.get_event_loop().time() - start
+                         self.completion_times[name] = duration
+                         self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
+                         if self.metrics_client:
+                             self.metrics_client.increment('task_succeeded')
+                         return
+                     except asyncio.TimeoutError:
+                         self.logger.warning(f"Timeout on {name}, attempt {attempt}")
+                     except Exception as e:
+                         self.logger.error(f"Error on {name} attempt {attempt}: {e}")
+
+                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
+                     delay *= 1 + random.uniform(0, self.backoff_jitter)
+                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+                     await asyncio.sleep(delay)
+
+             except asyncio.CancelledError:
+                 self.logger.warning(f"{name} update cancelled")
+                 raise
+
+             # permanent failure
+             self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
+             if self.metrics_client:
+                 self.metrics_client.increment('task_failed')
+             self.failed.append(name)
+
+     async def update_data(self, data_type: str, **kwargs: Any) -> None:
+         """
+         Entry point to update all artifacts of a given type concurrently.
+         """
+         self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
+
+         # RESET STATE
+         self.completion_times.clear()
+         self.failed.clear()
+         self.original_classes = self.get_artifact_classes(data_type)
+
+         # NON-DESTRUCTIVE SORTING
+         ordered = sorted(self.original_classes, key=self.estimate_priority)
+
+         sem = asyncio.Semaphore(self.max_workers)
+         tasks = [
+             asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
+             for cls in ordered
+         ]
+
+         try:
+             for coro in asyncio.as_completed(tasks):
+                 await coro
+         except asyncio.CancelledError:
+             self.logger.warning("update_data was cancelled—aborting remaining retries")
+             for t in tasks:
+                 t.cancel()
+             raise
+         finally:
+             total = len(self.original_classes)
+             completed = len(self.completion_times)
+             failed = len(self.failed)
+             self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
+
+     def get_update_status(self) -> Dict[str, Any]:
+         """
+         Returns summary status including completion times.
+         """
+         total = len(self.original_classes)
+         completed = set(self.completion_times.keys())
+         failed = set(self.failed)
+         pending = {cls.__name__ for cls in self.original_classes} - completed - failed
+
+         return {
+             'total': total,
+             'completed': list(completed),
+             'failed': list(failed),
+             'pending': list(pending),
+             'completion_times': self.completion_times,
+         }
+
+     @staticmethod
+     def format_status_table(status: Dict[str, Any]) -> str:
+         """
+         Formats the status dict into a readable table.
+         """
+         lines = [
+             f"Total: {status['total']}",
+             f"Completed: {len(status['completed'])} {status['completed']}",
+             f"Failed: {len(status['failed'])} {status['failed']}",
+             f"Pending: {len(status['pending'])} {status['pending']}",
+             "",
+             "Per-artifact timings:"
+         ]
+         for name, dur in status['completion_times'].items():
+             lines.append(f" {name}: {dur:.2f}s")
+         return "\n".join(lines)
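The new wrapper is driven entirely through update_data(), which instantiates each artifact class off the event loop, calls its update_parquet() with the forwarded kwargs, and retries with jittered exponential backoff. A minimal sketch of how it might be exercised, assembled from the docstring above (DemoArtifact, the local path, and the local filesystem are illustrative placeholders, not part of the package):

    import asyncio
    import fsspec

    from sibi_dst.df_helper._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapper

    class DemoArtifact:
        """Hypothetical stand-in for a real artifact class (e.g. a ParquetArtifact subclass)."""
        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def update_parquet(self, **kwargs):
            pass  # a real artifact would refresh its parquet data here

    async def main():
        wrapper = ArtifactUpdaterMultiWrapper(
            wrapped_classes={'mydata': [DemoArtifact]},
            max_workers=2,
            retry_attempts=1,
            artifact_class_kwargs={
                'fs': fsspec.filesystem('file'),
                'parquet_storage_path': '/tmp/artifacts',  # placeholder path
            },
        )
        await wrapper.update_data('mydata', period='today')
        print(wrapper.format_status_table(wrapper.get_update_status()))

    asyncio.run(main())

Because concurrency is capped by a semaphore rather than a worker pool, the ordering produced by estimate_priority() only affects the order in which tasks acquire a slot, not how many run at once.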
@@ -46,8 +46,7 @@ class DfHelper:
 
      :ivar df: The DataFrame currently being processed or loaded.
      :type df: Union[dd.DataFrame, pd.DataFrame]
-     :ivar backend_django: Configuration for interacting with Django database backends.
-     :type backend_connection: Optional[DjangoConnectionConfig]
+     :type backend_connection: Optional[DjangoConnectionConfig | SqlAlchemyConnectionConfig]
      :ivar _backend_query: Internal configuration for query handling.
      :type _backend_query: Optional[QueryConfig]
      :ivar _backend_params: Internal parameters configuration for DataFrame handling.
@@ -81,9 +80,10 @@ class DfHelper:
          self.debug = kwargs.setdefault("debug", False)
          self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
          # Configure logger level
-         self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
          self.logger.debug("Logger initialized in DEBUG mode.")
          self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
+         self.parquet_filename = kwargs.setdefault("parquet_filename", None)
          self.dt_field = kwargs.setdefault("dt_field", None)
          self.as_pandas = kwargs.setdefault("as_pandas", False)
          self.filesystem = kwargs.pop('filesystem', 'file')
@@ -429,7 +429,11 @@ class DfHelper:
          the instance's attribute for storage path.
          :return: None
          """
+         if self.df.map_partitions(len).compute().sum() == 0:
+             self.logger.debug("Cannot save to parquet since DataFrame is empty")
+             return
          fs = kwargs.pop('fs', self.fs)
+         parquet_filename = parquet_filename or self.parquet_filename
          parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
          ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
          ps.save_to_parquet(parquet_filename)
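The new guard skips the write when the Dask frame holds no rows; in isolation the same check looks like this (standalone sketch, independent of DfHelper):

    import dask.dataframe as dd
    import pandas as pd

    df = dd.from_pandas(pd.DataFrame({"a": []}), npartitions=1)

    # Sum of per-partition lengths: zero means there is nothing worth persisting.
    if df.map_partitions(len).compute().sum() == 0:
        print("Cannot save to parquet since DataFrame is empty")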
@@ -7,7 +7,8 @@ import dask.dataframe as dd
  import fsspec
  
  from sibi_dst.df_helper import DfHelper
- from sibi_dst.utils import DataWrapper, DateUtils, Logger
+ from sibi_dst.utils import DataWrapper, DateUtils, Logger, ParquetSaver, UpdatePlanner
+ from sibi_dst.utils import MissingManifestManager
  
  
  class ParquetArtifact(DfHelper):
@@ -57,7 +58,16 @@ class ParquetArtifact(DfHelper):
          'backend': 'parquet'
      }
  
-     def __init__(self, data_wrapper_class, **kwargs):
+     # DEFAULT_UPDATE_PLANNER_CONFIG = {
+     #     'reverse_order': True,
+     #     'overwrite': False,
+     #     'ignore_missing': True,
+     #     'history_days_threshold': 30,
+     #     'max_age_minutes': 10,
+     #     'show_progress': False
+     # }
+
+     def __init__(self, data_wrapper_class, **kwargs):
          """
          Initializes an instance of the class with given configuration and validates
          required parameters. Sets up the filesystem to handle storage, ensuring
@@ -79,46 +89,89 @@ class ParquetArtifact(DfHelper):
          `parquet_filename`, `parquet_start_date`,
          or `parquet_end_date`) are missing or not set properly.
          """
+
+         """Initialize with config, validate required fields, and setup filesystem."""
          self._lock = threading.Lock()
          self.config = {
              **self.DEFAULT_CONFIG,
              **kwargs,
          }
          self.df: Optional[dd.DataFrame] = None
-         self.debug = self.config.setdefault('debug', False)
-         self.logger = self.config.setdefault('logger',Logger.default_logger(logger_name=f'parquet_artifact_{__class__.__name__}'))
-         self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+         self._setup_logging()
          self.data_wrapper_class = data_wrapper_class
-         self.class_params = self.config.setdefault('class_params', None)
-         self.load_params = self.config.setdefault('load_params', None)
-         self.date_field = self.config.setdefault('date_field', None)
-         if self.date_field is None:
-             raise ValueError('date_field must be set')
-         self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
-         if self.parquet_storage_path is None:
-             raise ValueError('parquet_storage_path must be set')
-
-         self.parquet_filename = self.config.setdefault('parquet_filename', None)
-         if self.parquet_filename is None:
-             raise ValueError('parquet_filename must be set')
-         self.parquet_start_date = self.config.setdefault('parquet_start_date', None)
-         if self.parquet_start_date is None:
-             raise ValueError('parquet_start_date must be set')
-
-         self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
-         if self.parquet_end_date is None:
-             raise ValueError('parquet_end_date must be set')
+
+         self.date_field = self._validate_required('date_field')
+         self.parquet_storage_path = self._validate_required('parquet_storage_path')
+         self.parquet_filename = self._validate_required('parquet_filename')
+         self.parquet_start_date = self._validate_required('parquet_start_date')
+         self.parquet_end_date = self._validate_required('parquet_end_date')
  
          # Filesystem setup
          self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
          self.filesystem_options = self.config.setdefault('filesystem_options', {})
          self.fs = self.config.setdefault('fs', None)
+         self._own_fs = self.fs is None
          if self.fs is None:
              self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+             self._own_fs = True
          self.config.setdefault('fs', self.fs)
+         ## Populate to parameters to pass to data_wrapper_class
+         self.class_params = self.config.pop('class_params', {
+             'debug': self.debug,
+             'logger': self.logger,
+             'fs': self.fs,
+         })
+         # Populate parameters to pass to load method of DataWrapper class
+         self.load_params = self.config.setdefault('load_params', {})
          # Ensure the directory exists
          self.ensure_directory_exists(self.parquet_storage_path)
          super().__init__(**self.config)
+         self.update_planner_params = {}
+         self.datawrapper_params = {}
+
+     def _setup_logging(self):
+         """Initialize logger and debug settings."""
+         self.debug = self.config.get('debug', False)
+         self.logger = self.config.get('logger',
+                                       Logger.default_logger(
+                                           logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}'))
+         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+     def _validate_required(self, key: str) -> Any:
+         """Validate required configuration fields."""
+         value = self.config.setdefault(key, None)
+         if value is None:
+             raise ValueError(f'{key} must be set')
+         return value
+
+     def _setup_manifest(self, overwrite: bool = False, ignore_missing: bool = False):
+         self.skipped = []
+         self.missing_manifest_path = f"{self.parquet_storage_path}_manifests/missing.parquet"
+         self.mmanifest = MissingManifestManager(
+             fs=self.fs,
+             manifest_path=self.missing_manifest_path,
+             clear_existing=overwrite
+         )
+
+         # Initialize skipped files
+         manifest_exists = self.mmanifest._safe_exists(self.missing_manifest_path)
+         if not manifest_exists:
+             self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
+             self.mmanifest.save()
+         else:
+             self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
+
+         # Load skipped files if manifest exists and ignore_missing is True
+         self.skipped = self.mmanifest.load_existing()  # if ignore_missing and manifest_exists else []
+         self.logger.info(f"Skipped: {self.skipped}")
+         if overwrite:
+             self.skipped = []
+             self.ignore_missing = False
+
+     def _setup_update_planner(self, **kwargs) -> None:
+         self._prepare_update_params(**kwargs)
+         self.update_planner = UpdatePlanner(**self.update_planner_params)
+         self.update_planner.generate_plan(self.start_date, self.end_date)
  
      def load(self, **kwargs):
          with self._lock:
@@ -130,20 +183,30 @@ class ParquetArtifact(DfHelper):
          Generate a Parquet file using the configured DataWrapper class.
          """
          with self._lock:
-             params = self._prepare_params(kwargs)
-             dw = DataWrapper(self.data_wrapper_class, **params)
-             dw.process()
+             overwrite = kwargs.get('overwrite', False)
+             ignore_missing = kwargs.get('ignore_missing', False)
+             self._setup_manifest(overwrite, ignore_missing)
+             self._setup_update_planner(**kwargs)
+             params = self.datawrapper_params.copy()
+             params.update({
+                 'mmanifest': self.mmanifest,
+                 'update_planner': self.update_planner
+             })
+
+             with DataWrapper(self.data_wrapper_class, **params) as dw:
+                 dw.process()
  
      def __enter__(self):
          if getattr(self, "_entered", False):
              return self
          self._entered = True
-         self.ensure_directory_exists(self.parquet_storage_path)
          return self
  
      def __exit__(self, exc_type, exc_value, traceback):
          try:
-             if getattr(self, "_entered", False) and self.fs:
+             if self.mmanifest and self.mmanifest._new_records:
+                 self.mmanifest.save()
+             if getattr(self, "_entered", False) and self.fs and self._own_fs:
                  self.fs.close()
          except Exception as e:
              self.logger.warning(f"Error closing filesystem: {e}")
@@ -152,6 +215,18 @@ class ParquetArtifact(DfHelper):
          # return False so exceptions aren’t suppressed
          return False
  
+     @classmethod
+     def get_size_estimate(cls, parquet_path: str, **kwargs) -> int:
+         """
+         Estimate complexity as total bytes of all .parquet files under parquet_path.
+         Returns size in megabytes (so you can cap or scale priority sensibly).
+         """
+         fs, _, paths = cls.fs.get_fs_token_paths(parquet_path)
+         files = fs.glob(f"{parquet_path}/*.parquet")
+         total_bytes = sum(fs.size(f) for f in files)
+         # convert to “units” (e.g. MB) so priorities stay in a reasonable range
+         return max(1, int(total_bytes / (1024 ** 2)))
+
      def update_parquet(self, period: str = 'today', **kwargs) -> None:
          """Update the Parquet file with data from a specific period."""
  
@@ -206,36 +281,52 @@ class ParquetArtifact(DfHelper):
              'end_date': kwargs.get('parquet_end_date', self.parquet_end_date),
          }
  
-     def _prepare_params(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-         """Prepare the parameters for generating the Parquet file."""
-         kwargs = {**self.config, **kwargs}
-         return {
-             'class_params': kwargs.pop('class_params', None),
-             'date_field': kwargs.pop('date_field', self.date_field),
+     def _prepare_update_params(self, **kwargs) -> Dict[str, Any]:
+         self.reverse_order = kwargs.pop('reverse_order', True)
+         self.overwrite = kwargs.pop('overwrite', False)
+         self.ignore_missing = kwargs.pop('ignore_missing', False)
+         self.history_days_threshold = kwargs.pop('history_days_threshold', 30)
+         self.max_age_minutes = kwargs.pop('max_age_minutes', 10)
+         self.show_progress = kwargs.pop('show_progress', False)
+         self.start_date = kwargs.pop('parquet_start_date', self.parquet_start_date)
+         self.end_date = kwargs.pop('parquet_end_date', self.parquet_end_date)
+         self.parquet_filename = kwargs.pop('parquet_filename', self.parquet_filename)
+         self.verbose = kwargs.pop('verbose', False)
+
+         self.update_planner_params.update({
+             'filename': self.parquet_filename,
              'data_path': self.parquet_storage_path,
-             'parquet_filename': kwargs.pop('parquet_filename', self.parquet_filename),
-             'start_date': kwargs.pop('parquet_start_date', self.parquet_start_date),
-             'end_date': kwargs.pop('parquet_end_date', self.parquet_end_date),
-             'verbose': kwargs.pop('verbose', False),
-             'load_params': kwargs.pop('load_params', None),
-             'reverse_order': kwargs.pop('reverse_order', True),
-             'overwrite': kwargs.pop('overwrite', False),
-             'ignore_missing': kwargs.pop('ignore_missing', False),
+             'fs': self.fs,
+             'debug': self.debug,
              'logger': self.logger,
-             'history_days_threshold': kwargs.pop('history_days_threshold', 30),
-             'max_age_minutes': kwargs.pop('max_age_minutes', 10),
-             'show_progress': kwargs.pop('show_progress', False),
+             'reverse_order': self.reverse_order,
+             'overwrite': self.overwrite,
+             'ignore_missing': self.ignore_missing,
+             'history_days_threshold': self.history_days_threshold,
+             'max_age_minutes': self.max_age_minutes,
+             'show_progress': self.show_progress,
+             'description': f"{self.data_wrapper_class.__name__}",
+             'skipped': self.skipped,
+             'verbose': self.verbose,
+         })
+
+         self.datawrapper_params = {
+             'parquet_filename': self.parquet_filename,
+             'data_path': self.parquet_storage_path,
              'fs': self.fs,
-             'filesystem_type': self.filesystem_type,
-             'filesystem_options': self.filesystem_options,
+             'class_params': self.class_params,
+             'date_field': self.date_field,
+             'load_params': self.load_params,
+             'verbose': self.verbose
          }
  
-     @staticmethod
-     def parse_parquet_period(**kwargs):
+     def parse_parquet_period(self, **kwargs):
          start_date, end_date = DateUtils.parse_period(**kwargs)
+         self.parquet_start_date = start_date.strftime('%Y-%m-%d')
+         self.parquet_end_date = end_date.strftime('%Y-%m-%d')
          return {
-             'parquet_start_date': start_date.strftime('%Y-%m-%d'),
-             'parquet_end_date': end_date.strftime('%Y-%m-%d'),
+             'parquet_start_date': self.parquet_start_date,
+             'parquet_end_date': self.parquet_end_date,
          }
  
      def ensure_directory_exists(self, path: str) -> None:
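Taken together, these changes split the old _prepare_params() into planner parameters (for the new UpdatePlanner) and a slimmer DataWrapper parameter set, with the missing-file manifest threaded through both. A rough usage sketch, assuming a hypothetical DfHelper-based class MyDataModel and placeholder paths (not runnable as-is):

    import fsspec

    from sibi_dst.df_helper._parquet_artifact import ParquetArtifact

    artifact = ParquetArtifact(
        MyDataModel,                              # hypothetical data wrapper class
        date_field='created_at',                  # hypothetical date column
        parquet_storage_path='/data/mydata',      # placeholder path
        parquet_filename='mydata.parquet',
        parquet_start_date='2025-01-01',
        parquet_end_date='2025-01-31',
        fs=fsspec.filesystem('file'),
    )

    # Resolves the period, sets up the manifest and update plan, then runs
    # DataWrapper as a context manager (per the refactored generate_parquet()).
    artifact.update_parquet(period='today', overwrite=False, show_progress=True)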
@@ -155,7 +155,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
          key = self._engine_key()
          with self._registry_lock:
              if self._engine_registry.get(key) is not self.engine:
-                 self.logger.warning("Engine changed")
+                 self.logger.debug("Engine changed")
                  return 0
              pool = self.engine.pool
              if isinstance(pool, QueuePool):
@@ -191,10 +191,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
          with self._registry_lock:
              key = self._engine_key()
              if not self._owns_engine:
-                 self.logger.warning("Not owner, skipping close")
+                 self.logger.debug("Not owner, skipping close")
                  return
              if self._engine_registry.get(key) != self.engine:
-                 self.logger.warning("Engine not in registry")
+                 self.logger.debug("Engine not in registry")
                  return
              self.engine.dispose()
              del self._engine_registry[key]
@@ -17,7 +17,7 @@ from .data_wrapper import DataWrapper
  from .storage_config import StorageConfig
  from .data_from_http_source import DataFromHttpSource
  from .webdav_client import WebDAVClient
-
+ from .manifest_manager import MissingManifestManager
  
  __all__ = [
      "Logger",
@@ -38,5 +38,6 @@ __all__ = [
      "AirflowDAGManager",
      "StorageConfig",
      "DataFromHttpSource",
-     "WebDAVClient"
+     "WebDAVClient",
+     "MissingManifestManager",
  ]
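utils/__init__.py now re-exports MissingManifestManager from the new manifest_manager.py module. Based on how ParquetArtifact._setup_manifest() drives it above, a minimal sketch of the manifest flow might look like this (the manifest path and local filesystem are placeholders):

    import fsspec

    from sibi_dst.utils import MissingManifestManager

    fs = fsspec.filesystem('file')
    manifest_path = '/data/mydata_manifests/missing.parquet'  # placeholder

    manifest = MissingManifestManager(
        fs=fs,
        manifest_path=manifest_path,
        clear_existing=False,
    )
    if not manifest._safe_exists(manifest_path):
        manifest.save()                     # create the manifest on first use
    skipped = manifest.load_existing()      # paths previously recorded as missing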