sibi-dst 2025.9.3__py3-none-any.whl → 2025.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +6 -4
- sibi_dst/df_helper/__init__.py +1 -0
- sibi_dst/df_helper/_parquet_artifact.py +533 -113
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -281
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +349 -142
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -0
- sibi_dst/utils/data_wrapper.py +460 -61
- sibi_dst/utils/parquet_saver.py +403 -161
- sibi_dst/utils/update_planner.py +553 -319
- sibi_dst/utils/write_gatekeeper.py +18 -0
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/METADATA +2 -2
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/RECORD +13 -12
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/WHEEL +0 -0
@@ -1,59 +1,90 @@
+# parquet_artifact.py
 from __future__ import annotations

 import datetime as dt
 import threading
 from functools import cached_property
-from typing import Any, Dict, Type, TypeVar
+from typing import Any, Dict, Type, TypeVar, Optional, Set

 from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
-from sibi_dst.utils import MissingManifestManager, Logger
+from sibi_dst.utils import MissingManifestManager

+# Type variable for potential future use with specific DataWrapper subclasses
 T = TypeVar("T")


 class ParquetArtifact(ManagedResource):
     """
-    Orchestrates a single dataset:
-    - Builds/uses MissingManifestManager
-    - Plans work with UpdatePlanner
-    - Executes with DataWrapper (threaded) saving Dask → Parquet
-    - Prevents duplicate concurrent runs per (storage_path, filename)
-    - Forwards retry/backoff knobs to DataWrapper.process()
+    Orchestrates the generation of a single date-partitioned Parquet dataset.
+    - Manages a MissingManifestManager for tracking missing data.
+    - Uses an UpdatePlanner to determine which dates need processing.
+    - Delegates execution to a DataWrapper for concurrent date processing.
+    - Prevents duplicate concurrent runs for the same dataset.
+    - Forwards retry/backoff configuration to the DataWrapper.
     """

+    # --- Class-level state for concurrency control ---
     _global_lock = threading.RLock()
-    _active_runs: set[tuple[str, str]] = set()
+    _active_runs: Set[tuple[str, str]] = set()
     logger_extra = {"sibi_dst_component": __name__}

     def __init__(self, **kwargs: Any):
+        """
+        Initializes the ParquetArtifact.
+
+        Args:
+            **kwargs: Configuration parameters including:
+                - parquet_storage_path (str): The base S3/path for the dataset.
+                - parquet_filename (str): The name of the Parquet file within each date partition.
+                - data_wrapper_class (Type): The class (e.g., Etl...Dc) used to load/process data for a date.
+                - fs (fsspec.AbstractFileSystem): The filesystem object.
+                - logger (Logger): Logger instance.
+                - debug (bool): Enable debug logging.
+                - verbose (bool): Enable verbose logging.
+                - Plus other parameters passed to UpdatePlanner, DataWrapper, etc.
+        """
         # Merge defaults from ManagedResource and caller kwargs
-        self.all_kwargs: Dict[str, Any] = {**kwargs}
+        self.all_kwargs: Dict[str, Any] = dict(kwargs)  # Shallow copy
         super().__init__(**self.all_kwargs)

-        # Persist the minimal config we depend on frequently
+        # --- Instance-level coordination lock ---
         self._lock = threading.RLock()

-        # Required knobs
+        # --- Core Configuration (validated/accessed frequently) ---
+        if "parquet_storage_path" not in self.all_kwargs:
+            raise ValueError("Required argument 'parquet_storage_path' is missing.")
+        if "parquet_filename" not in self.all_kwargs:
+            raise ValueError("Required argument 'parquet_filename' is missing.")
+
         self._storage_path: str = self.all_kwargs["parquet_storage_path"]
         self._parquet_filename: str = self.all_kwargs["parquet_filename"]
-        self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
+        self._data_wrapper_class: Optional[Type] = self.all_kwargs.get("data_wrapper_class")
+
+        # Update logger extra with specific context
+        self.logger_extra.update({
+            "artifact_storage_path": self._storage_path,
+            "artifact_filename": self._parquet_filename
+        })

-    # ---------- helpers ----------
+    # --------------------- Helpers ---------------------
     def _invalidate_cached(self, *names: str) -> None:
-        for n in names:
-            self.__dict__.pop(n, None)
+        """Invalidate cached properties by name."""
+        for name in names:
+            self.__dict__.pop(name, None)

     def _build_manifest_path(self) -> str:
-        base = f"{self._storage_path}".rstrip("/") + "/"
+        """Construct the path for the missing manifest file."""
+        base = self._storage_path.rstrip("/") + "/"
         return f"{base}_manifests/missing.parquet"

-    # ---------- lazy members ----------
+    # --------------------- Lazy Members (Cached Properties) ---------------------
     @cached_property
     def mmanifest(self) -> MissingManifestManager:
+        """Lazily initialize and return the MissingManifestManager."""
         self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
         manifest_path = self._build_manifest_path()

-        # ensure manifest directory exists
+        # Ensure the manifest directory exists
         manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
         self.ensure_directory_exists(manifest_dir)

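For orientation before the remaining hunks, here is a minimal construction sketch that exercises the new required-argument checks added above. The loader class, paths, and the import location of ParquetArtifact are assumptions for illustration; only parquet_storage_path and parquet_filename are actually validated by the code in this hunk.

    import fsspec

    from sibi_dst.df_helper import ParquetArtifact  # import path assumed from the file list above

    # Hypothetical stand-in for an Etl...Dc loader class; real projects supply their own.
    class EtlOrdersDc:
        def __init__(self, **params): self.params = params
        def __enter__(self): return self
        def __exit__(self, *exc): return False
        def load(self, **kwargs): return None  # placeholder

    artifact = ParquetArtifact(
        parquet_storage_path="/tmp/orders",  # required; ValueError if missing (new in 2025.9.4)
        parquet_filename="orders.parquet",   # required; ValueError if missing (new in 2025.9.4)
        data_wrapper_class=EtlOrdersDc,      # hypothetical loader class
        fs=fsspec.filesystem("file"),
        show_progress=True,
        max_threads=3,
    )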
@@ -68,7 +99,11 @@ class ParquetArtifact(ManagedResource):

         if not mgr._safe_exists(mgr.manifest_path):
             self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
-            mgr.save()
+            try:
+                mgr.save()
+            except Exception as e:
+                self.logger.error(f"Failed to create initial manifest: {e}", extra=self.logger_extra)
+                raise
         else:
             self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)

@@ -76,9 +111,11 @@ class ParquetArtifact(ManagedResource):

     @cached_property
     def update_planner(self) -> UpdatePlanner:
+        """Lazily initialize and return the UpdatePlanner."""
         self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
         skipped_files = self.mmanifest.load_existing() or []

+        # Prepare configuration for the UpdatePlanner
         cfg = {
             **self.all_kwargs,
             "fs": self.fs,
@@ -86,18 +123,20 @@ class ParquetArtifact(ManagedResource):
             "logger": self.logger,
             "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
             "skipped": list(skipped_files),
-            "mmanifest": self.mmanifest,
+            "mmanifest": self.mmanifest,  # Pass the instance
         }
         return UpdatePlanner(**cfg)

     @cached_property
     def data_wrapper(self) -> DataWrapper:
+        """Lazily initialize and return the DataWrapper."""
         self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)

-        # Ensure the planner has a plan
-        if getattr(self.update_planner, "plan", None) is None:
-            self.update_planner.generate_plan()
+        # Ensure the planner has generated its plan (accessing the property triggers generation if needed)
+        # The planner itself checks if a plan already exists.
+        _ = self.update_planner.plan  # Access plan property

+        # Prepare parameters for the data wrapper class instantiation (passed to Etl...Dc)
         class_params = {
             "debug": self.debug,
             "logger": self.logger,
@@ -105,6 +144,7 @@ class ParquetArtifact(ManagedResource):
             "verbose": self.verbose,
         }

+        # Prepare configuration for the DataWrapper
         cfg = {
             "data_path": self._storage_path,
             "parquet_filename": self._parquet_filename,
@@ -115,29 +155,36 @@ class ParquetArtifact(ManagedResource):
             "dataclass": self._data_wrapper_class,
             "class_params": class_params,
             "load_params": self.all_kwargs.get("load_params", {}) or {},
-            "mmanifest": self.mmanifest,
-            "update_planner": self.update_planner,
+            "mmanifest": self.mmanifest,  # Pass the instance
+            "update_planner": self.update_planner,  # Pass the instance
             "date_field": self.all_kwargs.get("date_field"),
-            # pipeline execution knobs
+            # Pipeline execution knobs
             "show_progress": bool(self.all_kwargs.get("show_progress", False)),
             "timeout": float(self.all_kwargs.get("timeout", 30.0)),
             "max_threads": int(self.all_kwargs.get("max_threads", 3)),
         }
         return DataWrapper(**cfg)

-    # ---------- public API ----------
+    # --------------------- Public API ---------------------
     def load(self, **kwargs: Any):
         """
-        Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
-        Expected to return a Dask DataFrame from the loader.
+        Directly load data using the configured data_wrapper_class.
+        This bypasses the planning/manifest process.
+
+        Args:
+            **kwargs: Arguments passed to the dataclass's load method.
+
+        Returns:
+            The result of the dataclass's load method (expected to be a DataFrame).
         """
-        self.logger.info(f"Loading data from {self._storage_path}")
+        self.logger.info(f"Directly loading data using {self._data_wrapper_class}", extra=self.logger_extra)

         if not self._data_wrapper_class:
             raise ValueError("data_wrapper_class is not configured.")

+        # Prepare parameters for direct loading (typically using Parquet backend)
         params = {
-            "backend": "parquet",
+            "backend": "parquet",  # Usually implies loading from existing Parquet
             "fs": self.fs,
             "logger": self.logger,
             "debug": self.debug,
@@ -149,173 +196,546 @@ class ParquetArtifact(ManagedResource):
         }

         cls = self._data_wrapper_class
+        # Use context manager to ensure proper setup/teardown of the dataclass instance
         with cls(**params) as instance:
             return instance.load(**kwargs)

     def generate_parquet(self, **kwargs: Any) -> None:
         """
-        Generate or update Parquet according to the plan.
-        - Merges runtime kwargs
-        - Invalidates dependent caches
-        - Guards against duplicate concurrent runs
-        - Forwards retry/backoff to DataWrapper.process()
+        Generate or update the Parquet dataset according to the plan.
+        - Merges runtime kwargs.
+        - Invalidates dependent cached properties.
+        - Guards against duplicate concurrent runs.
+        - Forwards retry/backoff settings to DataWrapper.process().
         """
-        # Merge and invalidate caches that depend on runtime changes
+        # --- 1. Merge runtime configuration ---
         self.all_kwargs.update(kwargs)
+
+        # --- 2. Invalidate caches that depend on runtime changes ---
+        # These need to be recreated if their dependencies change
         self._invalidate_cached("update_planner", "data_wrapper")
         if "overwrite" in kwargs:
-            self._invalidate_cached("mmanifest")
+            self._invalidate_cached("mmanifest")  # Overwrite affects manifest creation

-        # Global de-dupe guard
+        # --- 3. Global concurrency control ---
         key = (self._storage_path, self._parquet_filename)
         with ParquetArtifact._global_lock:
             if key in ParquetArtifact._active_runs:
                 self.logger.info(
                     f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
                 )
-                return
+                return  # Exit early if another run is active
             ParquetArtifact._active_runs.add(key)
+            self.logger.debug(f"Acquired lock for run {key}.", extra=self.logger_extra)

         try:
+            # --- 4. Ensure base storage directory exists ---
             self.ensure_directory_exists(self._storage_path)

+            # --- 5. Generate update plan ---
+            self.logger.debug("Generating update plan...", extra=self.logger_extra)
             self.update_planner.generate_plan()
             plan = getattr(self.update_planner, "plan", None)
+
+            # --- 6. Check if any updates are required ---
             if plan is None or (hasattr(plan, "empty") and plan.empty):
-                # Planning uses Pandas; this is safe to check.
-                self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
+                # Planning uses Pandas; checking .empty is safe.
+                self.logger.info("No updates needed based on the plan. Skipping Parquet generation.", extra=self.logger_extra)
                 return

-            # Print plan once per run
+            # --- 7. (Optional) Display the plan ---
             if (
-                getattr(self.update_planner, "show_progress", False)
-                and not getattr(self.update_planner, "_printed_this_run", False)
+                getattr(self.update_planner, "show_progress", False) and
+                not getattr(self.update_planner, "_printed_this_run", False)
             ):
-                self.update_planner.show_update_plan()
+                try:
+                    self.update_planner.show_update_plan()
+                except Exception as e:
+                    self.logger.warning(f"Failed to display update plan: {e}", extra=self.logger_extra)
+                # Mark as printed to avoid repeated display if generate_parquet is called again
                 setattr(self.update_planner, "_printed_this_run", True)

-            # ---- forward retry/backoff knobs to DataWrapper.process() ----
+            # --- 8. Prepare retry/backoff configuration for DataWrapper ---
             dw_retry_kwargs = {
                 k: self.all_kwargs[k]
                 for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
                 if k in self.all_kwargs
             }

-            with self._lock:
-                dw = self.data_wrapper  # single cached_property access
+            # --- 9. Execute processing via DataWrapper ---
+            with self._lock:  # Instance-level lock for accessing cached_property
+                dw = self.data_wrapper  # Access the cached property (triggers initialization if needed)
                 if hasattr(dw, "process"):
-                    dw.process(**dw_retry_kwargs)
-                if getattr(self.update_planner, "show_progress", False) and hasattr(
-                    dw, "show_benchmark_summary"
-                ):
-                    dw.show_benchmark_summary()
+                    self.logger.info("Starting DataWrapper processing...", extra=self.logger_extra)
+                    dw.process(**dw_retry_kwargs)  # This is where the concurrent date processing happens
+                    self.logger.info("DataWrapper processing completed.", extra=self.logger_extra)
+
+                    # --- 10. (Optional) Show benchmark summary ---
+                    if getattr(self.update_planner, "show_progress", False) and hasattr(dw, "show_benchmark_summary"):
+                        try:
+                            dw.show_benchmark_summary()
+                        except Exception as e:
+                            self.logger.warning(f"Failed to show benchmark summary: {e}", extra=self.logger_extra)

         finally:
+            # --- 11. Release global concurrency lock ---
             with ParquetArtifact._global_lock:
                 ParquetArtifact._active_runs.discard(key)
+                self.logger.debug(f"Released lock for run {key}.", extra=self.logger_extra)

     def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
         """
-        High-level entry point to update Parquet for a given period:
-        - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
-        - 'ytd'
-        - 'itd' (requires history_begins_on)
-        - 'custom' (requires start_on / end_on)
-        Also accepts retry/backoff knobs which flow to DataWrapper.process().
+        High-level entry point to update Parquet for a standard or custom period.
+
+        Args:
+            period (str): The period to update. Options:
+                - Standard periods: 'today', 'yesterday', 'last_7_days', etc. (via DateUtils.parse_period)
+                - 'ytd': Year-to-date.
+                - 'itd': Inception-to-date (requires 'history_begins_on' in kwargs).
+                - 'custom': Requires 'start_on' and 'end_on' (aliases 'start_date'/'start', 'end_date'/'end' supported).
+            **kwargs: Additional arguments passed to generate_parquet, including retry/backoff settings.
         """
         final_kwargs = {**self.all_kwargs, **kwargs}
+        period_params: Dict[str, dt.date] = {}

-        def itd_config():
-            start_date = final_kwargs.get("history_begins_on")
-            if not start_date:
-                raise ValueError(
-                    "For period 'itd', 'history_begins_on' must be configured."
-                )
-            return {
+        # --- Determine date range based on period ---
+        if period == "itd":
+            start_date_str = final_kwargs.get("history_begins_on")
+            if not start_date_str:
+                raise ValueError("For period 'itd', 'history_begins_on' must be configured.")
+            try:
+                start_date = dt.datetime.strptime(start_date_str, "%Y-%m-%d").date()
+            except ValueError:
+                raise ValueError(f"Invalid date format for 'history_begins_on': {start_date_str}. Expected YYYY-MM-DD.")
+            period_params = {
                 "parquet_start_date": start_date,
                 "parquet_end_date": dt.date.today(),
             }

-        def ytd_config():
-            return {
+        elif period == "ytd":
+            period_params = {
                 "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
                 "parquet_end_date": dt.date.today(),
             }

-        def custom_config():
-            """
-            Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
-            are provided (with backward compatibility for `start_date`/`end_date` aliases).
-            """
-            # Backward compatibility: normalize aliases
+        elif period == "custom":
+            # --- Handle 'custom' period with alias normalization ---
             alias_map = {
-                "start_on": ("start_date", "start"),
-                "end_on": ("end_date", "end"),
+                "start_on": ["start_date", "start"],
+                "end_on": ["end_date", "end"],
             }
-            normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
-            for target, aliases in alias_map.items():
-                if target not in normalized_kwargs:
+            normalized_kwargs: Dict[str, Any] = dict(kwargs)  # Shallow copy
+
+            for target_key, aliases in alias_map.items():
+                if target_key not in normalized_kwargs:
                     for alias in aliases:
                         if alias in normalized_kwargs:
-                            normalized_kwargs[target] = normalized_kwargs[alias]
-                            break
+                            normalized_kwargs[target_key] = normalized_kwargs[alias]
+                            break  # Use the first alias found

-            # Validation
-            missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
-            if missing:
+            # --- Validate required keys for 'custom' ---
+            missing_keys = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
+            if missing_keys:
                 raise ValueError(
-                    f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
+                    f"For period 'custom', the following required parameters are missing: {', '.join(missing_keys)}"
                 )

-            return {
-                "parquet_start_date": normalized_kwargs["start_on"],
-                "parquet_end_date": normalized_kwargs["end_on"],
-            }
-
-        if period == "itd":
-            period_params = itd_config()
-        elif period == "ytd":
-            period_params = ytd_config()
-        elif period == "custom":
-            period_params = custom_config()
-        else:
-            start_date, end_date = DateUtils.parse_period(period=period)
+            # --- Parse and validate custom dates ---
+            try:
+                start_date_custom = dt.datetime.strptime(str(normalized_kwargs["start_on"]), "%Y-%m-%d").date()
+                end_date_custom = dt.datetime.strptime(str(normalized_kwargs["end_on"]), "%Y-%m-%d").date()
+                if start_date_custom > end_date_custom:
+                    raise ValueError(f"Start date {start_date_custom} cannot be after end date {end_date_custom}.")
+            except ValueError as e:
+                raise ValueError(f"Invalid date format or range for 'custom' period: {e}")

             period_params = {
-                "parquet_start_date": start_date,
-                "parquet_end_date": end_date,
+                "parquet_start_date": start_date_custom,
+                "parquet_end_date": end_date_custom,
             }

+        else:
+            # --- Handle standard periods via DateUtils ---
+            try:
+                start_date_std, end_date_std = DateUtils.parse_period(period=period)
+                period_params = {
+                    "parquet_start_date": start_date_std,
+                    "parquet_end_date": end_date_std,
+                }
+            except Exception as e:
+                raise ValueError(f"Failed to parse period '{period}': {e}") from e
+
+        # --- Merge period parameters and log ---
         final_kwargs.update(period_params)
         self.logger.debug(
-            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
+            f"Parameters for update_parquet/generate_parquet (period '{period}'): {final_kwargs}",
+            extra=self.logger_extra
         )

-        # Delegate to generator (handles cache invalidation + forwarding knobs)
+        # --- Delegate to the core generation logic ---
         self.generate_parquet(**final_kwargs)

-    # ---------- utils ----------
+    # --------------------- Utilities ---------------------
     def ensure_directory_exists(self, path: str) -> None:
-        """Ensure the directory exists across fsspec backends."""
+        """Ensure the directory exists, handling potential backend quirks."""
         with self._lock:
             if not self.fs.exists(path):
                 self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
                 try:
+                    # Try with exist_ok first (standard approach)
                     self.fs.makedirs(path, exist_ok=True)
                 except TypeError:
+                    # Fallback for backends that don't support exist_ok
                     try:
                         self.fs.makedirs(path)
                     except FileExistsError:
+                        # Handle race condition where dir was created between checks
                         pass
-
-    def _cleanup(self):
-        """Clean up resources upon exit."""
+                except Exception as e:
+                    # Catch other potential errors during directory creation
+                    self.logger.error(f"Failed to create directory {path}: {e}", extra=self.logger_extra)
+                    raise  # Re-raise to prevent proceeding with a missing directory
+
+    # --------------------- Cleanup ---------------------
+    def _cleanup(self) -> None:
+        """Clean up resources upon instance closure."""
         try:
-            if "mmanifest" in self.__dict__ and getattr(
-                self.mmanifest, "_new_records", None
+            # Save manifest if it was modified and has new records
+            if (
+                "mmanifest" in self.__dict__ and
+                hasattr(self.mmanifest, '_new_records') and
+                self.mmanifest._new_records
             ):
-                if self.mmanifest._new_records:
-                    self.mmanifest.save()
+                self.logger.debug("Saving updated manifest during cleanup.", extra=self.logger_extra)
+                self.mmanifest.save()
+
+            # Close the DataWrapper if it was initialized
             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                self.logger.debug("Closing DataWrapper during cleanup.", extra=self.logger_extra)
                 self.data_wrapper.close()
+
         except Exception as e:
-            self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)
+            self.logger.warning(f"Error during ParquetArtifact resource cleanup: {e}", extra=self.logger_extra)
+
+
+# from __future__ import annotations
+#
+# import datetime as dt
+# import threading
+# from functools import cached_property
+# from typing import Any, Dict, Type, TypeVar
+#
+# from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
+# from sibi_dst.utils import MissingManifestManager, Logger
+#
+# T = TypeVar("T")
+#
+#
+# class ParquetArtifact(ManagedResource):
+#     """
+#     Orchestrates a single dataset:
+#     - Builds/uses MissingManifestManager
+#     - Plans work with UpdatePlanner
+#     - Executes with DataWrapper (threaded) saving Dask → Parquet
+#     - Prevents duplicate concurrent runs per (storage_path, filename)
+#     - Forwards retry/backoff knobs to DataWrapper.process()
+#     """
+#
+#     _global_lock = threading.RLock()
+#     _active_runs: set[tuple[str, str]] = set()
+#     logger_extra = {"sibi_dst_component": __name__}
+#
+#     def __init__(self, **kwargs: Any):
+#         # Merge defaults from ManagedResource and caller kwargs
+#         self.all_kwargs: Dict[str, Any] = {**kwargs}
+#         super().__init__(**self.all_kwargs)
+#
+#         # Persist the minimal config we depend on frequently
+#         self._lock = threading.RLock()
+#
+#         # Required knobs
+#         self._storage_path: str = self.all_kwargs["parquet_storage_path"]
+#         self._parquet_filename: str = self.all_kwargs["parquet_filename"]
+#         self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
+#
+#     # ---------- helpers ----------
+#     def _invalidate_cached(self, *names: str) -> None:
+#         for n in names:
+#             self.__dict__.pop(n, None)
+#
+#     def _build_manifest_path(self) -> str:
+#         base = f"{self._storage_path}".rstrip("/") + "/"
+#         return f"{base}_manifests/missing.parquet"
+#
+#     # ---------- lazy members ----------
+#     @cached_property
+#     def mmanifest(self) -> MissingManifestManager:
+#         self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
+#         manifest_path = self._build_manifest_path()
+#
+#         # ensure manifest directory exists
+#         manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
+#         self.ensure_directory_exists(manifest_dir)
+#
+#         mgr = MissingManifestManager(
+#             fs=self.fs,
+#             manifest_path=manifest_path,
+#             clear_existing=self.all_kwargs.get("overwrite", False),
+#             debug=self.debug,
+#             logger=self.logger,
+#             overwrite=self.all_kwargs.get("overwrite", False),
+#         )
+#
+#         if not mgr._safe_exists(mgr.manifest_path):
+#             self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
+#             mgr.save()
+#         else:
+#             self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)
+#
+#         return mgr
+#
+#     @cached_property
+#     def update_planner(self) -> UpdatePlanner:
+#         self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
+#         skipped_files = self.mmanifest.load_existing() or []
+#
+#         cfg = {
+#             **self.all_kwargs,
+#             "fs": self.fs,
+#             "debug": self.debug,
+#             "logger": self.logger,
+#             "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
+#             "skipped": list(skipped_files),
+#             "mmanifest": self.mmanifest,
+#         }
+#         return UpdatePlanner(**cfg)
+#
+#     @cached_property
+#     def data_wrapper(self) -> DataWrapper:
+#         self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)
+#
+#         # Ensure the planner has a plan
+#         if getattr(self.update_planner, "plan", None) is None:
+#             self.update_planner.generate_plan()
+#
+#         class_params = {
+#             "debug": self.debug,
+#             "logger": self.logger,
+#             "fs": self.fs,
+#             "verbose": self.verbose,
+#         }
+#
+#         cfg = {
+#             "data_path": self._storage_path,
+#             "parquet_filename": self._parquet_filename,
+#             "fs": self.fs,
+#             "debug": self.debug,
+#             "logger": self.logger,
+#             "verbose": self.verbose,
+#             "dataclass": self._data_wrapper_class,
+#             "class_params": class_params,
+#             "load_params": self.all_kwargs.get("load_params", {}) or {},
+#             "mmanifest": self.mmanifest,
+#             "update_planner": self.update_planner,
+#             "date_field": self.all_kwargs.get("date_field"),
+#             # pipeline execution knobs
+#             "show_progress": bool(self.all_kwargs.get("show_progress", False)),
+#             "timeout": float(self.all_kwargs.get("timeout", 30.0)),
+#             "max_threads": int(self.all_kwargs.get("max_threads", 3)),
+#         }
+#         return DataWrapper(**cfg)
+#
+#     # ---------- public API ----------
+#     def load(self, **kwargs: Any):
+#         """
+#         Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
+#         Expected to return a Dask DataFrame from the loader.
+#         """
+#         self.logger.info(f"Loading data from {self._storage_path}")
+#
+#         if not self._data_wrapper_class:
+#             raise ValueError("data_wrapper_class is not configured.")
+#
+#         params = {
+#             "backend": "parquet",
+#             "fs": self.fs,
+#             "logger": self.logger,
+#             "debug": self.debug,
+#             "parquet_storage_path": self._storage_path,
+#             "parquet_filename": self._parquet_filename,
+#             "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
+#             "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+#             **(self.all_kwargs.get("class_params") or {}),
+#         }
+#
+#         cls = self._data_wrapper_class
+#         with cls(**params) as instance:
+#             return instance.load(**kwargs)
+#
+#     def generate_parquet(self, **kwargs: Any) -> None:
+#         """
+#         Generate or update Parquet according to the plan.
+#         - Merges runtime kwargs
+#         - Invalidates dependent caches
+#         - Guards against duplicate concurrent runs
+#         - Forwards retry/backoff to DataWrapper.process()
+#         """
+#         # Merge and invalidate caches that depend on runtime changes
+#         self.all_kwargs.update(kwargs)
+#         self._invalidate_cached("update_planner", "data_wrapper")
+#         if "overwrite" in kwargs:
+#             self._invalidate_cached("mmanifest")
+#
+#         # Global de-dupe guard
+#         key = (self._storage_path, self._parquet_filename)
+#         with ParquetArtifact._global_lock:
+#             if key in ParquetArtifact._active_runs:
+#                 self.logger.info(
+#                     f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
+#                 )
+#                 return
+#             ParquetArtifact._active_runs.add(key)
+#
+#         try:
+#             self.ensure_directory_exists(self._storage_path)
+#
+#             self.update_planner.generate_plan()
+#             plan = getattr(self.update_planner, "plan", None)
+#             if plan is None or (hasattr(plan, "empty") and plan.empty):
+#                 # Planning uses Pandas; this is safe to check.
+#                 self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
+#                 return
+#
+#             # Print plan once per run
+#             if (
+#                 getattr(self.update_planner, "show_progress", False)
+#                 and not getattr(self.update_planner, "_printed_this_run", False)
+#             ):
+#                 self.update_planner.show_update_plan()
+#                 setattr(self.update_planner, "_printed_this_run", True)
+#
+#             # ---- forward retry/backoff knobs to DataWrapper.process() ----
+#             dw_retry_kwargs = {
+#                 k: self.all_kwargs[k]
+#                 for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
+#                 if k in self.all_kwargs
+#             }
+#
+#             with self._lock:
+#                 dw = self.data_wrapper  # single cached_property access
+#                 if hasattr(dw, "process"):
+#                     dw.process(**dw_retry_kwargs)
+#                 if getattr(self.update_planner, "show_progress", False) and hasattr(
+#                     dw, "show_benchmark_summary"
+#                 ):
+#                     dw.show_benchmark_summary()
+#
+#         finally:
+#             with ParquetArtifact._global_lock:
+#                 ParquetArtifact._active_runs.discard(key)
+#
+#     def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
+#         """
+#         High-level entry point to update Parquet for a given period:
+#         - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
+#         - 'ytd'
+#         - 'itd' (requires history_begins_on)
+#         - 'custom' (requires start_on / end_on)
+#         Also accepts retry/backoff knobs which flow to DataWrapper.process().
+#         """
+#         final_kwargs = {**self.all_kwargs, **kwargs}
+#
+#         def itd_config():
+#             start_date = final_kwargs.get("history_begins_on")
+#             if not start_date:
+#                 raise ValueError(
+#                     "For period 'itd', 'history_begins_on' must be configured."
+#                 )
+#             return {
+#                 "parquet_start_date": start_date,
+#                 "parquet_end_date": dt.date.today(),
+#             }
+#
+#         def ytd_config():
+#             return {
+#                 "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
+#                 "parquet_end_date": dt.date.today(),
+#             }
+#
+#         def custom_config():
+#             """
+#             Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
+#             are provided (with backward compatibility for `start_date`/`end_date` aliases).
+#             """
+#             # Backward compatibility: normalize aliases
+#             alias_map = {
+#                 "start_on": ("start_date", "start"),
+#                 "end_on": ("end_date", "end"),
+#             }
+#             normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
+#             for target, aliases in alias_map.items():
+#                 if target not in normalized_kwargs:
+#                     for alias in aliases:
+#                         if alias in normalized_kwargs:
+#                             normalized_kwargs[target] = normalized_kwargs[alias]
+#                             break
+#
+#             # Validation
+#             missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
+#             if missing:
+#                 raise ValueError(
+#                     f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
+#                 )
+#
+#             return {
+#                 "parquet_start_date": normalized_kwargs["start_on"],
+#                 "parquet_end_date": normalized_kwargs["end_on"],
+#             }
+#
+#         if period == "itd":
+#             period_params = itd_config()
+#         elif period == "ytd":
+#             period_params = ytd_config()
+#         elif period == "custom":
+#             period_params = custom_config()
+#         else:
+#             start_date, end_date = DateUtils.parse_period(period=period)
+#             period_params = {
+#                 "parquet_start_date": start_date,
+#                 "parquet_end_date": end_date,
+#             }
+#
+#         final_kwargs.update(period_params)
+#         self.logger.debug(
+#             f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
+#         )
+#
+#         # Delegate to generator (handles cache invalidation + forwarding knobs)
+#         self.generate_parquet(**final_kwargs)
+#
+#     # ---------- utils ----------
+#     def ensure_directory_exists(self, path: str) -> None:
+#         """Ensure the directory exists across fsspec backends."""
+#         with self._lock:
+#             if not self.fs.exists(path):
+#                 self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
+#                 try:
+#                     self.fs.makedirs(path, exist_ok=True)
+#                 except TypeError:
+#                     try:
+#                         self.fs.makedirs(path)
+#                     except FileExistsError:
+#                         pass
+#
+#     def _cleanup(self):
+#         """Clean up resources upon exit."""
+#         try:
+#             if "mmanifest" in self.__dict__ and getattr(
+#                 self.mmanifest, "_new_records", None
+#             ):
+#                 if self.mmanifest._new_records:
+#                     self.mmanifest.save()
+#             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+#                 self.data_wrapper.close()
+#         except Exception as e:
+#             self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)