sibi-dst 2025.9.2__py3-none-any.whl → 2025.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,59 +1,90 @@
+# parquet_artifact.py
 from __future__ import annotations

 import datetime as dt
 import threading
 from functools import cached_property
-from typing import Any, Dict, Type, TypeVar
+from typing import Any, Dict, Type, TypeVar, Optional, Set

 from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
-from sibi_dst.utils import MissingManifestManager, Logger
+from sibi_dst.utils import MissingManifestManager

+# Type variable for potential future use with specific DataWrapper subclasses
 T = TypeVar("T")


 class ParquetArtifact(ManagedResource):
     """
-    Orchestrates a single dataset:
-      - Builds/uses MissingManifestManager
-      - Plans work with UpdatePlanner
-      - Executes with DataWrapper (threaded) saving Dask → Parquet
-      - Prevents duplicate concurrent runs per (storage_path, filename)
-      - Forwards retry/backoff knobs to DataWrapper.process()
+    Orchestrates the generation of a single date-partitioned Parquet dataset.
+      - Manages a MissingManifestManager for tracking missing data.
+      - Uses an UpdatePlanner to determine which dates need processing.
+      - Delegates execution to a DataWrapper for concurrent date processing.
+      - Prevents duplicate concurrent runs for the same dataset.
+      - Forwards retry/backoff configuration to the DataWrapper.
     """

+    # --- Class-level state for concurrency control ---
     _global_lock = threading.RLock()
-    _active_runs: set[tuple[str, str]] = set()
+    _active_runs: Set[tuple[str, str]] = set()
     logger_extra = {"sibi_dst_component": __name__}

     def __init__(self, **kwargs: Any):
+        """
+        Initializes the ParquetArtifact.
+
+        Args:
+            **kwargs: Configuration parameters including:
+                - parquet_storage_path (str): The base S3/path for the dataset.
+                - parquet_filename (str): The name of the Parquet file within each date partition.
+                - data_wrapper_class (Type): The class (e.g., Etl...Dc) used to load/process data for a date.
+                - fs (fsspec.AbstractFileSystem): The filesystem object.
+                - logger (Logger): Logger instance.
+                - debug (bool): Enable debug logging.
+                - verbose (bool): Enable verbose logging.
+                - Plus other parameters passed to UpdatePlanner, DataWrapper, etc.
+        """
         # Merge defaults from ManagedResource and caller kwargs
-        self.all_kwargs: Dict[str, Any] = {**kwargs}
+        self.all_kwargs: Dict[str, Any] = dict(kwargs)  # Shallow copy
         super().__init__(**self.all_kwargs)

-        # Persist the minimal config we depend on frequently
+        # --- Instance-level coordination lock ---
         self._lock = threading.RLock()

-        # Required knobs
+        # --- Core Configuration (validated/accessed frequently) ---
+        if "parquet_storage_path" not in self.all_kwargs:
+            raise ValueError("Required argument 'parquet_storage_path' is missing.")
+        if "parquet_filename" not in self.all_kwargs:
+            raise ValueError("Required argument 'parquet_filename' is missing.")
+
         self._storage_path: str = self.all_kwargs["parquet_storage_path"]
         self._parquet_filename: str = self.all_kwargs["parquet_filename"]
-        self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
+        self._data_wrapper_class: Optional[Type] = self.all_kwargs.get("data_wrapper_class")
+
+        # Update logger extra with specific context
+        self.logger_extra.update({
+            "artifact_storage_path": self._storage_path,
+            "artifact_filename": self._parquet_filename
+        })

-    # ---------- helpers ----------
+    # --------------------- Helpers ---------------------
     def _invalidate_cached(self, *names: str) -> None:
-        for n in names:
-            self.__dict__.pop(n, None)
+        """Invalidate cached properties by name."""
+        for name in names:
+            self.__dict__.pop(name, None)

     def _build_manifest_path(self) -> str:
-        base = f"{self._storage_path}".rstrip("/") + "/"
+        """Construct the path for the missing manifest file."""
+        base = self._storage_path.rstrip("/") + "/"
         return f"{base}_manifests/missing.parquet"
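
As a quick illustration of the helper above (the bucket and dataset names are made up, not taken from the package), a storage path such as "s3://my-bucket/orders" resolves to a manifest alongside the dataset:

base = "s3://my-bucket/orders".rstrip("/") + "/"       # illustrative storage path
manifest_path = f"{base}_manifests/missing.parquet"    # -> s3://my-bucket/orders/_manifests/missing.parquet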

-    # ---------- lazy members ----------
+    # --------------------- Lazy Members (Cached Properties) ---------------------
     @cached_property
     def mmanifest(self) -> MissingManifestManager:
+        """Lazily initialize and return the MissingManifestManager."""
         self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
         manifest_path = self._build_manifest_path()

-        # ensure manifest directory exists
+        # Ensure the manifest directory exists
         manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
         self.ensure_directory_exists(manifest_dir)

@@ -68,7 +99,11 @@ class ParquetArtifact(ManagedResource):

         if not mgr._safe_exists(mgr.manifest_path):
             self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
-            mgr.save()
+            try:
+                mgr.save()
+            except Exception as e:
+                self.logger.error(f"Failed to create initial manifest: {e}", extra=self.logger_extra)
+                raise
         else:
             self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)

@@ -76,9 +111,11 @@ class ParquetArtifact(ManagedResource):

     @cached_property
     def update_planner(self) -> UpdatePlanner:
+        """Lazily initialize and return the UpdatePlanner."""
         self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
         skipped_files = self.mmanifest.load_existing() or []

+        # Prepare configuration for the UpdatePlanner
         cfg = {
             **self.all_kwargs,
             "fs": self.fs,
@@ -86,18 +123,20 @@ class ParquetArtifact(ManagedResource):
             "logger": self.logger,
             "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
             "skipped": list(skipped_files),
-            "mmanifest": self.mmanifest,
+            "mmanifest": self.mmanifest,  # Pass the instance
         }
         return UpdatePlanner(**cfg)

     @cached_property
     def data_wrapper(self) -> DataWrapper:
+        """Lazily initialize and return the DataWrapper."""
         self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)

-        # Ensure the planner has a plan
-        if getattr(self.update_planner, "plan", None) is None:
-            self.update_planner.generate_plan()
+        # Ensure the planner has generated its plan (accessing the property triggers generation if needed)
+        # The planner itself checks if a plan already exists.
+        _ = self.update_planner.plan  # Access plan property

+        # Prepare parameters for the data wrapper class instantiation (passed to Etl...Dc)
         class_params = {
             "debug": self.debug,
             "logger": self.logger,
@@ -105,6 +144,7 @@ class ParquetArtifact(ManagedResource):
             "verbose": self.verbose,
         }

+        # Prepare configuration for the DataWrapper
         cfg = {
             "data_path": self._storage_path,
             "parquet_filename": self._parquet_filename,
@@ -115,29 +155,36 @@ class ParquetArtifact(ManagedResource):
             "dataclass": self._data_wrapper_class,
             "class_params": class_params,
             "load_params": self.all_kwargs.get("load_params", {}) or {},
-            "mmanifest": self.mmanifest,
-            "update_planner": self.update_planner,
+            "mmanifest": self.mmanifest,  # Pass the instance
+            "update_planner": self.update_planner,  # Pass the instance
             "date_field": self.all_kwargs.get("date_field"),
-            # pipeline execution knobs
+            # Pipeline execution knobs
             "show_progress": bool(self.all_kwargs.get("show_progress", False)),
             "timeout": float(self.all_kwargs.get("timeout", 30.0)),
             "max_threads": int(self.all_kwargs.get("max_threads", 3)),
         }
         return DataWrapper(**cfg)

-    # ---------- public API ----------
+    # --------------------- Public API ---------------------
     def load(self, **kwargs: Any):
         """
-        Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
-        Expected to return a Dask DataFrame from the loader.
+        Directly load data using the configured data_wrapper_class.
+        This bypasses the planning/manifest process.
+
+        Args:
+            **kwargs: Arguments passed to the dataclass's load method.
+
+        Returns:
+            The result of the dataclass's load method (expected to be a DataFrame).
         """
-        self.logger.info(f"Loading data from {self._storage_path}")
+        self.logger.info(f"Directly loading data using {self._data_wrapper_class}", extra=self.logger_extra)

         if not self._data_wrapper_class:
             raise ValueError("data_wrapper_class is not configured.")

+        # Prepare parameters for direct loading (typically using Parquet backend)
         params = {
-            "backend": "parquet",
+            "backend": "parquet",  # Usually implies loading from existing Parquet
            "fs": self.fs,
            "logger": self.logger,
            "debug": self.debug,
@@ -149,173 +196,546 @@ class ParquetArtifact(ManagedResource):
         }

         cls = self._data_wrapper_class
+        # Use context manager to ensure proper setup/teardown of the dataclass instance
         with cls(**params) as instance:
             return instance.load(**kwargs)
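
A minimal usage sketch of this direct-load path, assuming an fsspec filesystem and a dataclass compatible with the parameters shown above; MyEtlDc and the local paths are illustrative, not part of the package:

import fsspec

fs = fsspec.filesystem("file")                # any fsspec backend (local, s3, ...)
artifact = ParquetArtifact(
    data_wrapper_class=MyEtlDc,               # hypothetical ETL dataclass
    parquet_storage_path="/data/orders",      # illustrative dataset path
    parquet_filename="orders.parquet",
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
    fs=fs,
)
df = artifact.load()  # returns whatever MyEtlDc.load() returns (typically a Dask DataFrame)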

     def generate_parquet(self, **kwargs: Any) -> None:
         """
-        Generate or update Parquet according to the plan.
-          - Merges runtime kwargs
-          - Invalidates dependent caches
-          - Guards against duplicate concurrent runs
-          - Forwards retry/backoff to DataWrapper.process()
+        Generate or update the Parquet dataset according to the plan.
+          - Merges runtime kwargs.
+          - Invalidates dependent cached properties.
+          - Guards against duplicate concurrent runs.
+          - Forwards retry/backoff settings to DataWrapper.process().
         """
-        # Merge and invalidate caches that depend on runtime changes
+        # --- 1. Merge runtime configuration ---
         self.all_kwargs.update(kwargs)
+
+        # --- 2. Invalidate caches that depend on runtime changes ---
+        # These need to be recreated if their dependencies change
         self._invalidate_cached("update_planner", "data_wrapper")
         if "overwrite" in kwargs:
-            self._invalidate_cached("mmanifest")
+            self._invalidate_cached("mmanifest")  # Overwrite affects manifest creation

-        # Global de-dupe guard
+        # --- 3. Global concurrency control ---
         key = (self._storage_path, self._parquet_filename)
         with ParquetArtifact._global_lock:
             if key in ParquetArtifact._active_runs:
                 self.logger.info(
                     f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
                 )
-                return
+                return  # Exit early if another run is active
             ParquetArtifact._active_runs.add(key)
+            self.logger.debug(f"Acquired lock for run {key}.", extra=self.logger_extra)

         try:
+            # --- 4. Ensure base storage directory exists ---
             self.ensure_directory_exists(self._storage_path)

+            # --- 5. Generate update plan ---
+            self.logger.debug("Generating update plan...", extra=self.logger_extra)
             self.update_planner.generate_plan()
             plan = getattr(self.update_planner, "plan", None)
+
+            # --- 6. Check if any updates are required ---
             if plan is None or (hasattr(plan, "empty") and plan.empty):
-                # Planning uses Pandas; this is safe to check.
-                self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
+                # Planning uses Pandas; checking .empty is safe.
+                self.logger.info("No updates needed based on the plan. Skipping Parquet generation.", extra=self.logger_extra)
                 return

-            # Print plan once per run
+            # --- 7. (Optional) Display the plan ---
             if (
-                getattr(self.update_planner, "show_progress", False)
-                and not getattr(self.update_planner, "_printed_this_run", False)
+                getattr(self.update_planner, "show_progress", False) and
+                not getattr(self.update_planner, "_printed_this_run", False)
             ):
-                self.update_planner.show_update_plan()
+                try:
+                    self.update_planner.show_update_plan()
+                except Exception as e:
+                    self.logger.warning(f"Failed to display update plan: {e}", extra=self.logger_extra)
+                # Mark as printed to avoid repeated display if generate_parquet is called again
                 setattr(self.update_planner, "_printed_this_run", True)

-            # ---- forward retry/backoff knobs to DataWrapper.process() ----
+            # --- 8. Prepare retry/backoff configuration for DataWrapper ---
             dw_retry_kwargs = {
                 k: self.all_kwargs[k]
                 for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
                 if k in self.all_kwargs
             }

-            with self._lock:
-                dw = self.data_wrapper  # single cached_property access
+            # --- 9. Execute processing via DataWrapper ---
+            with self._lock:  # Instance-level lock for accessing cached_property
+                dw = self.data_wrapper  # Access the cached property (triggers initialization if needed)
                 if hasattr(dw, "process"):
-                    dw.process(**dw_retry_kwargs)
-                    if getattr(self.update_planner, "show_progress", False) and hasattr(
-                        dw, "show_benchmark_summary"
-                    ):
-                        dw.show_benchmark_summary()
+                    self.logger.info("Starting DataWrapper processing...", extra=self.logger_extra)
+                    dw.process(**dw_retry_kwargs)  # This is where the concurrent date processing happens
+                    self.logger.info("DataWrapper processing completed.", extra=self.logger_extra)
+
+                    # --- 10. (Optional) Show benchmark summary ---
+                    if getattr(self.update_planner, "show_progress", False) and hasattr(dw, "show_benchmark_summary"):
+                        try:
+                            dw.show_benchmark_summary()
+                        except Exception as e:
+                            self.logger.warning(f"Failed to show benchmark summary: {e}", extra=self.logger_extra)

         finally:
+            # --- 11. Release global concurrency lock ---
             with ParquetArtifact._global_lock:
                 ParquetArtifact._active_runs.discard(key)
+                self.logger.debug(f"Released lock for run {key}.", extra=self.logger_extra)

     def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
         """
-        High-level entry point to update Parquet for a given period:
-          - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
-          - 'ytd'
-          - 'itd' (requires history_begins_on)
-          - 'custom' (requires start_on / end_on)
-        Also accepts retry/backoff knobs which flow to DataWrapper.process().
+        High-level entry point to update Parquet for a standard or custom period.
+
+        Args:
+            period (str): The period to update. Options:
+                - Standard periods: 'today', 'yesterday', 'last_7_days', etc. (via DateUtils.parse_period)
+                - 'ytd': Year-to-date.
+                - 'itd': Inception-to-date (requires 'history_begins_on' in kwargs).
+                - 'custom': Requires 'start_on' and 'end_on' (aliases 'start_date'/'start', 'end_date'/'end' supported).
+            **kwargs: Additional arguments passed to generate_parquet, including retry/backoff settings.
         """
         final_kwargs = {**self.all_kwargs, **kwargs}
+        period_params: Dict[str, dt.date] = {}

-        def itd_config():
-            start_date = final_kwargs.get("history_begins_on")
-            if not start_date:
-                raise ValueError(
-                    "For period 'itd', 'history_begins_on' must be configured."
-                )
-            return {
+        # --- Determine date range based on period ---
+        if period == "itd":
+            start_date_str = final_kwargs.get("history_begins_on")
+            if not start_date_str:
+                raise ValueError("For period 'itd', 'history_begins_on' must be configured.")
+            try:
+                start_date = dt.datetime.strptime(start_date_str, "%Y-%m-%d").date()
+            except ValueError:
+                raise ValueError(f"Invalid date format for 'history_begins_on': {start_date_str}. Expected YYYY-MM-DD.")
+            period_params = {
                 "parquet_start_date": start_date,
                 "parquet_end_date": dt.date.today(),
             }

-        def ytd_config():
-            return {
+        elif period == "ytd":
+            period_params = {
                 "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
                 "parquet_end_date": dt.date.today(),
             }

-        def custom_config():
-            """
-            Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
-            are provided (with backward compatibility for `start_date`/`end_date` aliases).
-            """
-            # Backward compatibility: normalize aliases
+        elif period == "custom":
+            # --- Handle 'custom' period with alias normalization ---
             alias_map = {
-                "start_on": ("start_date", "start"),
-                "end_on": ("end_date", "end"),
+                "start_on": ["start_date", "start"],
+                "end_on": ["end_date", "end"],
             }
-            normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
-            for target, aliases in alias_map.items():
-                if target not in normalized_kwargs:
+            normalized_kwargs: Dict[str, Any] = dict(kwargs)  # Shallow copy
+
+            for target_key, aliases in alias_map.items():
+                if target_key not in normalized_kwargs:
                     for alias in aliases:
                         if alias in normalized_kwargs:
-                            normalized_kwargs[target] = normalized_kwargs[alias]
-                            break
+                            normalized_kwargs[target_key] = normalized_kwargs[alias]
+                            break  # Use the first alias found

-            # Validation
-            missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
-            if missing:
+            # --- Validate required keys for 'custom' ---
+            missing_keys = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
+            if missing_keys:
                 raise ValueError(
-                    f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
+                    f"For period 'custom', the following required parameters are missing: {', '.join(missing_keys)}"
                 )

-            return {
-                "parquet_start_date": normalized_kwargs["start_on"],
-                "parquet_end_date": normalized_kwargs["end_on"],
-            }
+            # --- Parse and validate custom dates ---
+            try:
+                start_date_custom = dt.datetime.strptime(str(normalized_kwargs["start_on"]), "%Y-%m-%d").date()
+                end_date_custom = dt.datetime.strptime(str(normalized_kwargs["end_on"]), "%Y-%m-%d").date()
+                if start_date_custom > end_date_custom:
+                    raise ValueError(f"Start date {start_date_custom} cannot be after end date {end_date_custom}.")
+            except ValueError as e:
+                raise ValueError(f"Invalid date format or range for 'custom' period: {e}")

-        if period == "itd":
-            period_params = itd_config()
-        elif period == "ytd":
-            period_params = ytd_config()
-        elif period == "custom":
-            period_params = custom_config()
-        else:
-            start_date, end_date = DateUtils.parse_period(period=period)
             period_params = {
-                "parquet_start_date": start_date,
-                "parquet_end_date": end_date,
+                "parquet_start_date": start_date_custom,
+                "parquet_end_date": end_date_custom,
             }

+        else:
+            # --- Handle standard periods via DateUtils ---
+            try:
+                start_date_std, end_date_std = DateUtils.parse_period(period=period)
+                period_params = {
+                    "parquet_start_date": start_date_std,
+                    "parquet_end_date": end_date_std,
+                }
+            except Exception as e:
+                raise ValueError(f"Failed to parse period '{period}': {e}") from e
+
+        # --- Merge period parameters and log ---
         final_kwargs.update(period_params)
         self.logger.debug(
-            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
+            f"Parameters for update_parquet/generate_parquet (period '{period}'): {final_kwargs}",
+            extra=self.logger_extra
         )

-        # Delegate to generator (handles cache invalidation + forwarding knobs)
+        # --- Delegate to the core generation logic ---
         self.generate_parquet(**final_kwargs)
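
A hedged sketch of this period-based entry point, reusing the illustrative artifact from the earlier example; the specific knob values are made up:

# Standard period handled by DateUtils.parse_period
artifact.update_parquet(period="last_7_days")

# Custom range; 'start_date'/'end_date' aliases are also accepted
artifact.update_parquet(period="custom", start_on="2025-01-01", end_on="2025-03-31")

# Inception-to-date, with retry/backoff knobs forwarded to DataWrapper.process()
artifact.update_parquet(period="itd", history_begins_on="2020-01-01", max_retries=3, backoff_base=2.0)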

-    # ---------- utils ----------
+    # --------------------- Utilities ---------------------
     def ensure_directory_exists(self, path: str) -> None:
-        """Ensure the directory exists across fsspec backends."""
+        """Ensure the directory exists, handling potential backend quirks."""
         with self._lock:
             if not self.fs.exists(path):
                 self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
                 try:
+                    # Try with exist_ok first (standard approach)
                     self.fs.makedirs(path, exist_ok=True)
                 except TypeError:
+                    # Fallback for backends that don't support exist_ok
                     try:
                         self.fs.makedirs(path)
                     except FileExistsError:
+                        # Handle race condition where dir was created between checks
                         pass
-
-    def _cleanup(self):
-        """Clean up resources upon exit."""
+                except Exception as e:
+                    # Catch other potential errors during directory creation
+                    self.logger.error(f"Failed to create directory {path}: {e}", extra=self.logger_extra)
+                    raise  # Re-raise to prevent proceeding with a missing directory
+
+    # --------------------- Cleanup ---------------------
+    def _cleanup(self) -> None:
+        """Clean up resources upon instance closure."""
         try:
-            if "mmanifest" in self.__dict__ and getattr(
-                self.mmanifest, "_new_records", None
+            # Save manifest if it was modified and has new records
+            if (
+                "mmanifest" in self.__dict__ and
+                hasattr(self.mmanifest, '_new_records') and
+                self.mmanifest._new_records
             ):
-                if self.mmanifest._new_records:
-                    self.mmanifest.save()
+                self.logger.debug("Saving updated manifest during cleanup.", extra=self.logger_extra)
+                self.mmanifest.save()
+
+            # Close the DataWrapper if it was initialized
             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                self.logger.debug("Closing DataWrapper during cleanup.", extra=self.logger_extra)
                 self.data_wrapper.close()
+
         except Exception as e:
-            self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)
+            self.logger.warning(f"Error during ParquetArtifact resource cleanup: {e}", extra=self.logger_extra)
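
Finally, a short sketch of the plan-then-process flow with the retry/backoff knobs that generate_parquet forwards to DataWrapper.process(); the values are illustrative, and the artifact is assumed to be configured as in the earlier sketch:

artifact.generate_parquet(
    overwrite=False,       # True would also rebuild the missing-data manifest
    show_progress=True,    # prints the update plan and a benchmark summary
    max_threads=4,
    max_retries=3,
    backoff_base=2.0,
    backoff_jitter=0.1,
    backoff_max=60.0,
)
# A second call for the same (parquet_storage_path, parquet_filename) while this one
# is still running is skipped by the class-level _active_runs guard.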
419
+
420
+
421
+ # from __future__ import annotations
422
+ #
423
+ # import datetime as dt
424
+ # import threading
425
+ # from functools import cached_property
426
+ # from typing import Any, Dict, Type, TypeVar
427
+ #
428
+ # from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
429
+ # from sibi_dst.utils import MissingManifestManager, Logger
430
+ #
431
+ # T = TypeVar("T")
432
+ #
433
+ #
434
+ # class ParquetArtifact(ManagedResource):
435
+ # """
436
+ # Orchestrates a single dataset:
437
+ # - Builds/uses MissingManifestManager
438
+ # - Plans work with UpdatePlanner
439
+ # - Executes with DataWrapper (threaded) saving Dask → Parquet
440
+ # - Prevents duplicate concurrent runs per (storage_path, filename)
441
+ # - Forwards retry/backoff knobs to DataWrapper.process()
442
+ # """
443
+ #
444
+ # _global_lock = threading.RLock()
445
+ # _active_runs: set[tuple[str, str]] = set()
446
+ # logger_extra = {"sibi_dst_component": __name__}
447
+ #
448
+ # def __init__(self, **kwargs: Any):
449
+ # # Merge defaults from ManagedResource and caller kwargs
450
+ # self.all_kwargs: Dict[str, Any] = {**kwargs}
451
+ # super().__init__(**self.all_kwargs)
452
+ #
453
+ # # Persist the minimal config we depend on frequently
454
+ # self._lock = threading.RLock()
455
+ #
456
+ # # Required knobs
457
+ # self._storage_path: str = self.all_kwargs["parquet_storage_path"]
458
+ # self._parquet_filename: str = self.all_kwargs["parquet_filename"]
459
+ # self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
460
+ #
461
+ # # ---------- helpers ----------
462
+ # def _invalidate_cached(self, *names: str) -> None:
463
+ # for n in names:
464
+ # self.__dict__.pop(n, None)
465
+ #
466
+ # def _build_manifest_path(self) -> str:
467
+ # base = f"{self._storage_path}".rstrip("/") + "/"
468
+ # return f"{base}_manifests/missing.parquet"
469
+ #
470
+ # # ---------- lazy members ----------
471
+ # @cached_property
472
+ # def mmanifest(self) -> MissingManifestManager:
473
+ # self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
474
+ # manifest_path = self._build_manifest_path()
475
+ #
476
+ # # ensure manifest directory exists
477
+ # manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
478
+ # self.ensure_directory_exists(manifest_dir)
479
+ #
480
+ # mgr = MissingManifestManager(
481
+ # fs=self.fs,
482
+ # manifest_path=manifest_path,
483
+ # clear_existing=self.all_kwargs.get("overwrite", False),
484
+ # debug=self.debug,
485
+ # logger=self.logger,
486
+ # overwrite=self.all_kwargs.get("overwrite", False),
487
+ # )
488
+ #
489
+ # if not mgr._safe_exists(mgr.manifest_path):
490
+ # self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
491
+ # mgr.save()
492
+ # else:
493
+ # self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)
494
+ #
495
+ # return mgr
496
+ #
497
+ # @cached_property
498
+ # def update_planner(self) -> UpdatePlanner:
499
+ # self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
500
+ # skipped_files = self.mmanifest.load_existing() or []
501
+ #
502
+ # cfg = {
503
+ # **self.all_kwargs,
504
+ # "fs": self.fs,
505
+ # "debug": self.debug,
506
+ # "logger": self.logger,
507
+ # "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
508
+ # "skipped": list(skipped_files),
509
+ # "mmanifest": self.mmanifest,
510
+ # }
511
+ # return UpdatePlanner(**cfg)
512
+ #
513
+ # @cached_property
514
+ # def data_wrapper(self) -> DataWrapper:
515
+ # self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)
516
+ #
517
+ # # Ensure the planner has a plan
518
+ # if getattr(self.update_planner, "plan", None) is None:
519
+ # self.update_planner.generate_plan()
520
+ #
521
+ # class_params = {
522
+ # "debug": self.debug,
523
+ # "logger": self.logger,
524
+ # "fs": self.fs,
525
+ # "verbose": self.verbose,
526
+ # }
527
+ #
528
+ # cfg = {
529
+ # "data_path": self._storage_path,
530
+ # "parquet_filename": self._parquet_filename,
531
+ # "fs": self.fs,
532
+ # "debug": self.debug,
533
+ # "logger": self.logger,
534
+ # "verbose": self.verbose,
535
+ # "dataclass": self._data_wrapper_class,
536
+ # "class_params": class_params,
537
+ # "load_params": self.all_kwargs.get("load_params", {}) or {},
538
+ # "mmanifest": self.mmanifest,
539
+ # "update_planner": self.update_planner,
540
+ # "date_field": self.all_kwargs.get("date_field"),
541
+ # # pipeline execution knobs
542
+ # "show_progress": bool(self.all_kwargs.get("show_progress", False)),
543
+ # "timeout": float(self.all_kwargs.get("timeout", 30.0)),
544
+ # "max_threads": int(self.all_kwargs.get("max_threads", 3)),
545
+ # }
546
+ # return DataWrapper(**cfg)
547
+ #
548
+ # # ---------- public API ----------
549
+ # def load(self, **kwargs: Any):
550
+ # """
551
+ # Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
552
+ # Expected to return a Dask DataFrame from the loader.
553
+ # """
554
+ # self.logger.info(f"Loading data from {self._storage_path}")
555
+ #
556
+ # if not self._data_wrapper_class:
557
+ # raise ValueError("data_wrapper_class is not configured.")
558
+ #
559
+ # params = {
560
+ # "backend": "parquet",
561
+ # "fs": self.fs,
562
+ # "logger": self.logger,
563
+ # "debug": self.debug,
564
+ # "parquet_storage_path": self._storage_path,
565
+ # "parquet_filename": self._parquet_filename,
566
+ # "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
567
+ # "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
568
+ # **(self.all_kwargs.get("class_params") or {}),
569
+ # }
570
+ #
571
+ # cls = self._data_wrapper_class
572
+ # with cls(**params) as instance:
573
+ # return instance.load(**kwargs)
574
+ #
575
+ # def generate_parquet(self, **kwargs: Any) -> None:
576
+ # """
577
+ # Generate or update Parquet according to the plan.
578
+ # - Merges runtime kwargs
579
+ # - Invalidates dependent caches
580
+ # - Guards against duplicate concurrent runs
581
+ # - Forwards retry/backoff to DataWrapper.process()
582
+ # """
583
+ # # Merge and invalidate caches that depend on runtime changes
584
+ # self.all_kwargs.update(kwargs)
585
+ # self._invalidate_cached("update_planner", "data_wrapper")
586
+ # if "overwrite" in kwargs:
587
+ # self._invalidate_cached("mmanifest")
588
+ #
589
+ # # Global de-dupe guard
590
+ # key = (self._storage_path, self._parquet_filename)
591
+ # with ParquetArtifact._global_lock:
592
+ # if key in ParquetArtifact._active_runs:
593
+ # self.logger.info(
594
+ # f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
595
+ # )
596
+ # return
597
+ # ParquetArtifact._active_runs.add(key)
598
+ #
599
+ # try:
600
+ # self.ensure_directory_exists(self._storage_path)
601
+ #
602
+ # self.update_planner.generate_plan()
603
+ # plan = getattr(self.update_planner, "plan", None)
604
+ # if plan is None or (hasattr(plan, "empty") and plan.empty):
605
+ # # Planning uses Pandas; this is safe to check.
606
+ # self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
607
+ # return
608
+ #
609
+ # # Print plan once per run
610
+ # if (
611
+ # getattr(self.update_planner, "show_progress", False)
612
+ # and not getattr(self.update_planner, "_printed_this_run", False)
613
+ # ):
614
+ # self.update_planner.show_update_plan()
615
+ # setattr(self.update_planner, "_printed_this_run", True)
616
+ #
617
+ # # ---- forward retry/backoff knobs to DataWrapper.process() ----
618
+ # dw_retry_kwargs = {
619
+ # k: self.all_kwargs[k]
620
+ # for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
621
+ # if k in self.all_kwargs
622
+ # }
623
+ #
624
+ # with self._lock:
625
+ # dw = self.data_wrapper # single cached_property access
626
+ # if hasattr(dw, "process"):
627
+ # dw.process(**dw_retry_kwargs)
628
+ # if getattr(self.update_planner, "show_progress", False) and hasattr(
629
+ # dw, "show_benchmark_summary"
630
+ # ):
631
+ # dw.show_benchmark_summary()
632
+ #
633
+ # finally:
634
+ # with ParquetArtifact._global_lock:
635
+ # ParquetArtifact._active_runs.discard(key)
636
+ #
637
+ # def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
638
+ # """
639
+ # High-level entry point to update Parquet for a given period:
640
+ # - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
641
+ # - 'ytd'
642
+ # - 'itd' (requires history_begins_on)
643
+ # - 'custom' (requires start_on / end_on)
644
+ # Also accepts retry/backoff knobs which flow to DataWrapper.process().
645
+ # """
646
+ # final_kwargs = {**self.all_kwargs, **kwargs}
647
+ #
648
+ # def itd_config():
649
+ # start_date = final_kwargs.get("history_begins_on")
650
+ # if not start_date:
651
+ # raise ValueError(
652
+ # "For period 'itd', 'history_begins_on' must be configured."
653
+ # )
654
+ # return {
655
+ # "parquet_start_date": start_date,
656
+ # "parquet_end_date": dt.date.today(),
657
+ # }
658
+ #
659
+ # def ytd_config():
660
+ # return {
661
+ # "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
662
+ # "parquet_end_date": dt.date.today(),
663
+ # }
664
+ #
665
+ # def custom_config():
666
+ # """
667
+ # Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
668
+ # are provided (with backward compatibility for `start_date`/`end_date` aliases).
669
+ # """
670
+ # # Backward compatibility: normalize aliases
671
+ # alias_map = {
672
+ # "start_on": ("start_date", "start"),
673
+ # "end_on": ("end_date", "end"),
674
+ # }
675
+ # normalized_kwargs = dict(kwargs) # shallow copy so we don't mutate original
676
+ # for target, aliases in alias_map.items():
677
+ # if target not in normalized_kwargs:
678
+ # for alias in aliases:
679
+ # if alias in normalized_kwargs:
680
+ # normalized_kwargs[target] = normalized_kwargs[alias]
681
+ # break
682
+ #
683
+ # # Validation
684
+ # missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
685
+ # if missing:
686
+ # raise ValueError(
687
+ # f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
688
+ # )
689
+ #
690
+ # return {
691
+ # "parquet_start_date": normalized_kwargs["start_on"],
692
+ # "parquet_end_date": normalized_kwargs["end_on"],
693
+ # }
694
+ #
695
+ # if period == "itd":
696
+ # period_params = itd_config()
697
+ # elif period == "ytd":
698
+ # period_params = ytd_config()
699
+ # elif period == "custom":
700
+ # period_params = custom_config()
701
+ # else:
702
+ # start_date, end_date = DateUtils.parse_period(period=period)
703
+ # period_params = {
704
+ # "parquet_start_date": start_date,
705
+ # "parquet_end_date": end_date,
706
+ # }
707
+ #
708
+ # final_kwargs.update(period_params)
709
+ # self.logger.debug(
710
+ # f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
711
+ # )
712
+ #
713
+ # # Delegate to generator (handles cache invalidation + forwarding knobs)
714
+ # self.generate_parquet(**final_kwargs)
715
+ #
716
+ # # ---------- utils ----------
717
+ # def ensure_directory_exists(self, path: str) -> None:
718
+ # """Ensure the directory exists across fsspec backends."""
719
+ # with self._lock:
720
+ # if not self.fs.exists(path):
721
+ # self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
722
+ # try:
723
+ # self.fs.makedirs(path, exist_ok=True)
724
+ # except TypeError:
725
+ # try:
726
+ # self.fs.makedirs(path)
727
+ # except FileExistsError:
728
+ # pass
729
+ #
730
+ # def _cleanup(self):
731
+ # """Clean up resources upon exit."""
732
+ # try:
733
+ # if "mmanifest" in self.__dict__ and getattr(
734
+ # self.mmanifest, "_new_records", None
735
+ # ):
736
+ # if self.mmanifest._new_records:
737
+ # self.mmanifest.save()
738
+ # if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
739
+ # self.data_wrapper.close()
740
+ # except Exception as e:
741
+ # self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)