sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  13. sibi_dst/utils/base.py +302 -96
  14. sibi_dst/utils/clickhouse_writer.py +472 -206
  15. sibi_dst/utils/data_utils.py +139 -186
  16. sibi_dst/utils/data_wrapper.py +317 -73
  17. sibi_dst/utils/date_utils.py +1 -0
  18. sibi_dst/utils/df_utils.py +193 -213
  19. sibi_dst/utils/file_utils.py +3 -2
  20. sibi_dst/utils/filepath_generator.py +314 -152
  21. sibi_dst/utils/log_utils.py +581 -242
  22. sibi_dst/utils/manifest_manager.py +60 -76
  23. sibi_dst/utils/parquet_saver.py +33 -27
  24. sibi_dst/utils/phone_formatter.py +88 -95
  25. sibi_dst/utils/update_planner.py +180 -178
  26. sibi_dst/utils/webdav_client.py +116 -166
  27. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  28. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
  29. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_parquet_artifact.py
@@ -1,328 +1,300 @@
 from __future__ import annotations
-import datetime
+
+import datetime as dt
 import threading
-from typing import Optional, Any, Dict, ClassVar
+from functools import cached_property
+from typing import Any, Dict, Type, TypeVar

-import dask.dataframe as dd
-import fsspec
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
+from sibi_dst.utils import MissingManifestManager, Logger

-from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
-from sibi_dst.utils import MissingManifestManager
+T = TypeVar("T")


-class ParquetArtifact(DfHelper):
+class ParquetArtifact(ManagedResource):
     """
-    Class designed to manage Parquet data storage and retrieval using a specified
-    DataWrapper class for data processing. It provides functionality for loading,
-    updating, rebuilding, and generating Parquet files within a configurable
-    storage filesystem. The class ensures that all essential configurations and
-    filesystems are properly set up before operations.
-
-    Detailed functionality includes support for dynamically managing and generating
-    Parquet files based on time periods, with customizable options for paths,
-    filenames, date fields, and more. It is an abstraction for efficiently handling
-    storage tasks related to distributed or local file systems.
-
-    :ivar config: Configuration dictionary containing all configurable parameters
-        for managing Parquet data storage, such as paths, filenames,
-        and date ranges.
-    :type config: dict
-    :ivar df: Cached Dask DataFrame used to store and manipulate data loaded
-        from the Parquet file.
-    :type df: Optional[dask.dataframe.DataFrame]
-    :ivar data_wrapper_class: Class responsible for abstracting data processing
-        operations required for Parquet file generation.
-    :type data_wrapper_class: type
-    :ivar date_field: Name of the field used to identify and process data by date.
-    :type date_field: Optional[str]
-    :ivar parquet_storage_path: Filesystem path to store Parquet files.
-    :type parquet_storage_path: Optional[str]
-    :ivar parquet_filename: Name of the Parquet file to be generated and managed.
-    :type parquet_filename: Optional[str]
-    :ivar parquet_start_date: Date string specifying the start date for data range
-        processing.
-    :type parquet_start_date: Optional[str]
-    :ivar parquet_end_date: Date string specifying the end date for data range
-        processing.
-    :type parquet_end_date: Optional[str]
-    :ivar filesystem_type: Type of the filesystem used for managing storage
-        operations (e.g., `file`, `s3`, etc.).
-    :type filesystem_type: str
-    :ivar filesystem_options: Additional options for configuring the filesystem.
-    :type filesystem_options: dict
-    :ivar fs: Filesystem object used for storage operations.
-    :type fs: fsspec.AbstractFileSystem
+    Orchestrates a single dataset:
+      - Builds/uses MissingManifestManager
+      - Plans work with UpdatePlanner
+      - Executes with DataWrapper (threaded) saving Dask Parquet
+      - Prevents duplicate concurrent runs per (storage_path, filename)
+      - Forwards retry/backoff knobs to DataWrapper.process()
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str, str]] = {
-        'backend': 'parquet'
-    }

+    _global_lock = threading.RLock()
+    _active_runs: set[tuple[str, str]] = set()

-    def __init__(self, data_wrapper_class, **kwargs):
-        """
-        Initializes an instance of the class with given configuration and validates
-        required parameters. Sets up the filesystem to handle storage, ensuring
-        necessary directories exist. The configuration supports a variety of options
-        to manage parquet storage requirements, including paths, filenames, and date
-        ranges.
-
-        :param data_wrapper_class: The class responsible for wrapping data to be managed
-            by this instance.
-        :type data_wrapper_class: type
-        :param kwargs: Arbitrary keyword arguments to override default configuration.
-            Includes settings for `date_field`, `parquet_storage_path`,
-            `parquet_filename`, `parquet_start_date`, `parquet_end_date`,
-            `filesystem_type`, `filesystem_options`, and `fs`.
-        :type kwargs: dict
-
-        :raises ValueError: If any of the required configuration options
-            (`date_field`, `parquet_storage_path`,
-            `parquet_filename`, `parquet_start_date`,
-            or `parquet_end_date`) are missing or not set properly.
-        """
+    def __init__(self, **kwargs: Any):
+        # Merge defaults from ManagedResource and caller kwargs
+        self.all_kwargs: Dict[str, Any] = {**kwargs}
+        super().__init__(**self.all_kwargs)

-        """Initialize with config, validate required fields, and setup filesystem."""
-        self._lock = threading.Lock()
-        self.config = {
-            **self.DEFAULT_CONFIG,
-            **kwargs,
-        }
-        self.df: Optional[dd.DataFrame] = None
-        super().__init__(**self.config)
-        self.data_wrapper_class = data_wrapper_class
-
-        self.date_field = self._validate_required('date_field')
-        self.parquet_storage_path = self._validate_required('parquet_storage_path')
-        self.parquet_filename = self._validate_required('parquet_filename')
-        self.parquet_start_date = self._validate_required('parquet_start_date')
-        self.parquet_end_date = self._validate_required('parquet_end_date')
-
-        self.class_params = self.config.pop('class_params', {
-            'debug': self.debug,
-            'logger': self.logger,
-            'fs': self.fs,
-            'verbose': self.verbose,
-        })
-        # Populate parameters to pass to load method of DataWrapper class
-        self.load_params = self.config.setdefault('load_params', {})
-        # Ensure the directory exists
-        self.ensure_directory_exists(self.parquet_storage_path)
-        #super().__init__(**self.config)
-        self.update_planner_params = {}
-        self.datawrapper_params = {}
-
-    def _validate_required(self, key: str) -> Any:
-        """Validate required configuration fields."""
-        value = self.config.setdefault(key, None)
-        if value is None:
-            raise ValueError(f'{key} must be set')
-        return value
-
-    def _setup_manifest(self, overwrite: bool = False, ignore_missing: bool = False):
-        self.skipped = []
-        self.missing_manifest_path = f"{self.parquet_storage_path}_manifests/missing.parquet"
-        self.mmanifest = MissingManifestManager(
+        # Persist the minimal config we depend on frequently
+        self._lock = threading.RLock()
+
+        # Required knobs
+        self._storage_path: str = self.all_kwargs["parquet_storage_path"]
+        self._parquet_filename: str = self.all_kwargs["parquet_filename"]
+        self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
+
+    # ---------- helpers ----------
+    def _invalidate_cached(self, *names: str) -> None:
+        for n in names:
+            self.__dict__.pop(n, None)
+
+    def _build_manifest_path(self) -> str:
+        base = f"{self._storage_path}".rstrip("/") + "/"
+        return f"{base}_manifests/missing.parquet"
+
+    # ---------- lazy members ----------
+    @cached_property
+    def mmanifest(self) -> MissingManifestManager:
+        self.logger.info("Initializing MissingManifestManager...")
+        manifest_path = self._build_manifest_path()
+
+        # ensure manifest directory exists
+        manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
+        self.ensure_directory_exists(manifest_dir)
+
+        mgr = MissingManifestManager(
             fs=self.fs,
-            manifest_path=self.missing_manifest_path,
-            clear_existing=overwrite,
-            debug= self.debug,
-            logger=self.logger
+            manifest_path=manifest_path,
+            clear_existing=self.all_kwargs.get("overwrite", False),
+            debug=self.debug,
+            logger=self.logger,
+            overwrite=self.all_kwargs.get("overwrite", False),
         )

-        # Initialize skipped files
-        manifest_exists = self.mmanifest._safe_exists(self.missing_manifest_path)
-        if not manifest_exists:
-            self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
-            self.mmanifest.save()
-            #self.mmanifest.cleanup_temp_manifests()
+        if not mgr._safe_exists(mgr.manifest_path):
+            self.logger.info(f"Creating new manifest at {mgr.manifest_path}")
+            mgr.save()
         else:
-            self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
+            self.logger.info(f"Manifest already exists at {mgr.manifest_path}")
+
+        return mgr
+
+    @cached_property
+    def update_planner(self) -> UpdatePlanner:
+        self.logger.info("Initializing UpdatePlanner...")
+        skipped_files = self.mmanifest.load_existing() or []
+
+        cfg = {
+            **self.all_kwargs,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
+            "skipped": list(skipped_files),
+            "mmanifest": self.mmanifest,
+        }
+        return UpdatePlanner(**cfg)

-        # Load skipped files if manifest exists and ignore_missing is True
-        self.skipped = self.mmanifest.load_existing() # if ignore_missing and manifest_exists else []
-        self.logger.info(f"Skipped: {self.skipped}")
-        if overwrite:
-            self.skipped = []
-            self.ignore_missing = False
+    @cached_property
+    def data_wrapper(self) -> DataWrapper:
+        self.logger.info("Initializing DataWrapper...")

-    def _setup_update_planner(self, **kwargs) -> None:
-        self._prepare_update_params(**kwargs)
-        self.update_planner = UpdatePlanner(**self.update_planner_params)
-        self.update_planner.generate_plan(start=self.start_date, end=self.end_date)
+        # Ensure the planner has a plan
+        if getattr(self.update_planner, "plan", None) is None:
+            self.update_planner.generate_plan()

-    def load(self, **kwargs):
-        with self._lock:
-            self.df = super().load(**kwargs)
-            return self.df
+        class_params = {
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": self.verbose,
+        }

-    def generate_parquet(self, **kwargs) -> None:
-        """
-        Generate a Parquet file using the configured DataWrapper class.
-        """
-        with self._lock:
-            overwrite = kwargs.get('overwrite', False)
-            ignore_missing = kwargs.get('ignore_missing', False)
-            self._setup_manifest(overwrite, ignore_missing)
-            self._setup_update_planner(**kwargs)
-            params = self.datawrapper_params.copy()
-            params.update({
-                'mmanifest': self.mmanifest,
-                'update_planner': self.update_planner
-            })
-
-            with DataWrapper(self.data_wrapper_class, **params) as dw:
-                dw.process()
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        try:
-            if self.mmanifest and self.mmanifest._new_records:
-                self.mmanifest.save()
-        except Exception as e:
-            self.logger.warning(f"Error closing filesystem: {e}")
-        finally:
-            super().__exit__(exc_type, exc_value, traceback)
-            # return False so exceptions aren’t suppressed
-            return False
+        cfg = {
+            "data_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "verbose": self.verbose,
+            "dataclass": self._data_wrapper_class,
+            "class_params": class_params,
+            "load_params": self.all_kwargs.get("load_params", {}) or {},
+            "mmanifest": self.mmanifest,
+            "update_planner": self.update_planner,
+            "date_field": self.all_kwargs.get("date_field"),
+            # pipeline execution knobs
+            "show_progress": bool(self.all_kwargs.get("show_progress", False)),
+            "timeout": float(self.all_kwargs.get("timeout", 30.0)),
+            "max_threads": int(self.all_kwargs.get("max_threads", 3)),
+        }
+        return DataWrapper(**cfg)

-    def get_size_estimate(self, **kwargs) -> int:
+    # ---------- public API ----------
+    def load(self, **kwargs: Any):
         """
-        Synchronously estimates artifact size for use in multi-threaded environments.
-
-        This method safely executes asynchronous I/O operations from a synchronous
-        context, handling variations in fsspec filesystem implementations.
+        Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
+        Expected to return a Dask DataFrame from the loader.
         """
+        self.logger.info(f"Loading data from {self._storage_path}")
+
+        if not self._data_wrapper_class:
+            raise ValueError("data_wrapper_class is not configured.")
+
+        params = {
+            "backend": "parquet",
+            "fs": self.fs,
+            "logger": self.logger,
+            "debug": self.debug,
+            "parquet_storage_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
+            "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+            **(self.all_kwargs.get("class_params") or {}),
+        }

-        async def _get_total_bytes_async():
-            """A helper async coroutine to perform the I/O."""
-            import asyncio
-
-            files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
-            if not files:
-                return 0
+        cls = self._data_wrapper_class
+        with cls(**params) as instance:
+            return instance.load(**kwargs)

-            size_tasks = [self.fs._size(f) for f in files]
-            sizes = await asyncio.gather(*size_tasks)
-            return sum(s for s in sizes if s is not None)
+    def generate_parquet(self, **kwargs: Any) -> None:
+        """
+        Generate or update Parquet according to the plan.
+        - Merges runtime kwargs
+        - Invalidates dependent caches
+        - Guards against duplicate concurrent runs
+        - Forwards retry/backoff to DataWrapper.process()
+        """
+        # Merge and invalidate caches that depend on runtime changes
+        self.all_kwargs.update(kwargs)
+        self._invalidate_cached("update_planner", "data_wrapper")
+        if "overwrite" in kwargs:
+            self._invalidate_cached("mmanifest")
+
+        # Global de-dupe guard
+        key = (self._storage_path, self._parquet_filename)
+        with ParquetArtifact._global_lock:
+            if key in ParquetArtifact._active_runs:
+                self.logger.info(
+                    f"Run already in progress for {key}; skipping this invocation."
+                )
+                return
+            ParquetArtifact._active_runs.add(key)

         try:
-            # Attempt the standard fsspec method first
-            total_bytes = self.fs.sync(_get_total_bytes_async())
-        except AttributeError:
-            # fallback for filesystems like s3fs that lack .sync()
-            total_bytes = self.fs.loop.run_until_complete(_get_total_bytes_async())
+            self.ensure_directory_exists(self._storage_path)
+
+            self.update_planner.generate_plan()
+            plan = getattr(self.update_planner, "plan", None)
+            if plan is None or (hasattr(plan, "empty") and plan.empty):
+                # Planning uses Pandas; this is safe to check.
+                self.logger.info("No updates needed. Skipping Parquet generation.")
+                return
+
+            # Print plan once per run
+            if (
+                getattr(self.update_planner, "show_progress", False)
+                and not getattr(self.update_planner, "_printed_this_run", False)
+            ):
+                self.update_planner.show_update_plan()
+                setattr(self.update_planner, "_printed_this_run", True)
+
+            # ---- forward retry/backoff knobs to DataWrapper.process() ----
+            dw_retry_kwargs = {
+                k: self.all_kwargs[k]
+                for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
+                if k in self.all_kwargs
+            }

-        # Convert to megabytes, ensuring a minimum of 1
-        return max(1, int(total_bytes / (1024 ** 2)))
+            with self._lock:
+                dw = self.data_wrapper # single cached_property access
+                if hasattr(dw, "process"):
+                    dw.process(**dw_retry_kwargs)
+                if getattr(self.update_planner, "show_progress", False) and hasattr(
+                    dw, "show_benchmark_summary"
+                ):
+                    dw.show_benchmark_summary()

-    def update_parquet(self, period: str = 'today', **kwargs) -> None:
-        """Update the Parquet file with data from a specific period."""
+        finally:
+            with ParquetArtifact._global_lock:
+                ParquetArtifact._active_runs.discard(key)
+
+    def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
+        """
+        High-level entry point to update Parquet for a given period:
+          - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
+          - 'ytd'
+          - 'itd' (requires history_begins_on)
+          - 'custom' (requires start_on / end_on)
+        Also accepts retry/backoff knobs which flow to DataWrapper.process().
+        """
+        final_kwargs = {**self.all_kwargs, **kwargs}

         def itd_config():
-            try:
-                start_date = kwargs.pop('history_begins_on')
-            except KeyError:
-                raise ValueError("For period 'itd', you must provide 'history_begins_on' in kwargs.")
-            return {'parquet_start_date': start_date, 'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')}
+            start_date = final_kwargs.get("history_begins_on")
+            if not start_date:
+                raise ValueError(
+                    "For period 'itd', 'history_begins_on' must be configured."
+                )
+            return {
+                "parquet_start_date": start_date,
+                "parquet_end_date": dt.date.today(),
+            }

         def ytd_config():
             return {
-                'parquet_start_date': datetime.date(datetime.date.today().year, 1, 1).strftime('%Y-%m-%d'),
-                'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')
+                "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
+                "parquet_end_date": dt.date.today(),
             }

         def custom_config():
-            try:
-                start_date = kwargs.pop('start_on')
-                end_date = kwargs.pop('end_on')
-            except KeyError:
-                raise ValueError("For period 'custom', you must provide 'start_on' in kwargs.")
+            if "start_on" not in final_kwargs or "end_on" not in final_kwargs:
+                raise ValueError(
+                    "For period 'custom', provide 'start_on' and 'end_on'."
+                )
             return {
-                'parquet_start_date': start_date,
-                'parquet_end_date': end_date
+                "parquet_start_date": final_kwargs["start_on"],
+                "parquet_end_date": final_kwargs["end_on"],
             }

-        config_map = {
-            'itd': itd_config,
-            'ytd': ytd_config,
-            'custom': custom_config,
-        }
-
-        if period in config_map:
-            kwargs.update(config_map[period]())
+        if period == "itd":
+            period_params = itd_config()
+        elif period == "ytd":
+            period_params = ytd_config()
+        elif period == "custom":
+            period_params = custom_config()
         else:
-            kwargs.update(self.parse_parquet_period(period=period))
-        self.logger.debug(f"kwargs passed to update parquet: {kwargs}")
-        self.generate_parquet(**kwargs)
-
-    def rebuild_parquet(self, **kwargs) -> None:
-        """Rebuild the Parquet file from the start to end date."""
-        kwargs.update(self._get_rebuild_params(kwargs))
-        self.generate_parquet(**kwargs)
-
-    def _get_rebuild_params(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Prepare parameters for rebuilding the Parquet file."""
-        return {
-            'overwrite': True,
-            'reverse_order': True,
-            'start_date': kwargs.get('parquet_start_date', self.parquet_start_date),
-            'end_date': kwargs.get('parquet_end_date', self.parquet_end_date),
-        }
+            start_date, end_date = DateUtils.parse_period(period=period)
+            period_params = {
+                "parquet_start_date": start_date,
+                "parquet_end_date": end_date,
+            }

-    def _prepare_update_params(self, **kwargs) -> Dict[str, Any]:
-        self.reverse_order = kwargs.pop('reverse_order', True)
-        self.overwrite = kwargs.pop('overwrite', False)
-        self.ignore_missing = kwargs.pop('ignore_missing', False)
-        self.history_days_threshold = kwargs.pop('history_days_threshold', 30)
-        self.max_age_minutes = kwargs.pop('max_age_minutes', 10)
-        self.show_progress = kwargs.pop('show_progress', False)
-        self.start_date = kwargs.pop('parquet_start_date', self.parquet_start_date)
-        self.end_date = kwargs.pop('parquet_end_date', self.parquet_end_date)
-        self.parquet_filename = kwargs.pop('parquet_filename', self.parquet_filename)
-        self.verbose = kwargs.pop('verbose', False)
-
-        self.update_planner_params.update({
-            'filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'reverse_order': self.reverse_order,
-            'overwrite': self.overwrite,
-            'ignore_missing': self.ignore_missing,
-            'history_days_threshold': self.history_days_threshold,
-            'max_age_minutes': self.max_age_minutes,
-            'show_progress': self.show_progress,
-            'description': f"{self.data_wrapper_class.__name__}",
-            'skipped': self.skipped,
-            'verbose': self.verbose,
-        })
-
-        self.datawrapper_params = {
-            'parquet_filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'class_params': self.class_params,
-            'date_field': self.date_field,
-            'load_params': self.load_params,
-            'verbose': self.verbose
-        }
+        final_kwargs.update(period_params)
+        self.logger.debug(
+            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}"
+        )

-    def parse_parquet_period(self, **kwargs):
-        start_date, end_date = DateUtils.parse_period(**kwargs)
-        self.parquet_start_date = start_date.strftime('%Y-%m-%d')
-        self.parquet_end_date = end_date.strftime('%Y-%m-%d')
-        return {
-            'parquet_start_date': self.parquet_start_date,
-            'parquet_end_date': self.parquet_end_date,
-        }
+        # Delegate to generator (handles cache invalidation + forwarding knobs)
+        self.generate_parquet(**final_kwargs)

+    # ---------- utils ----------
     def ensure_directory_exists(self, path: str) -> None:
-        """Ensure the directory exists in the specified filesystem."""
+        """Ensure the directory exists across fsspec backends."""
         with self._lock:
-            try:
-                self.fs.makedirs(path, exist_ok=True)
-            except Exception as e:
-                raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+            if not self.fs.exists(path):
+                self.logger.info(f"Creating directory: {path}")
+                try:
+                    self.fs.makedirs(path, exist_ok=True)
+                except TypeError:
+                    try:
+                        self.fs.makedirs(path)
+                    except FileExistsError:
+                        pass
+
+    def _cleanup(self):
+        """Clean up resources upon exit."""
+        try:
+            if "mmanifest" in self.__dict__ and getattr(
+                self.mmanifest, "_new_records", None
+            ):
+                if self.mmanifest._new_records:
+                    self.mmanifest.save()
+            if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                self.data_wrapper.close()
+        except Exception as e:
+            self.logger.warning(f"Error during resource cleanup: {e}")
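The hunk above rewrites ParquetArtifact from a DfHelper subclass into a ManagedResource-based orchestrator with lazily cached manifest, planner, and wrapper members. A minimal usage sketch follows; only the keyword names are taken from the new __init__, update_parquet, and generate_parquet code, while the import path, the keywords accepted by ManagedResource (fs, debug), the local paths, and MyDatasetHelper (a stand-in for a project-specific loader class) are assumptions.

    # Hedged sketch, not sibi-dst documentation: MyDatasetHelper and the paths are hypothetical.
    import fsspec
    from sibi_dst.df_helper import ParquetArtifact  # import path assumed

    artifact = ParquetArtifact(
        data_wrapper_class=MyDatasetHelper,        # hypothetical loader class
        parquet_storage_path="/data/orders/",      # hypothetical storage location
        parquet_filename="orders.parquet",
        date_field="order_date",
        parquet_start_date="2025-01-01",
        parquet_end_date="2025-08-01",
        fs=fsspec.filesystem("file"),              # assumed to be forwarded to ManagedResource
        show_progress=True,
        max_threads=3,
        # retry/backoff knobs are forwarded to DataWrapper.process()
        max_retries=3,
        backoff_base=2.0,
    )

    artifact.update_parquet(period="ytd")   # plan and write year-to-date partitions
    ddf = artifact.load()                   # read the stored dataset back as a Dask DataFrame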
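The refactor also leans on functools.cached_property for mmanifest, update_planner, and data_wrapper, with _invalidate_cached popping names out of the instance __dict__ so the next access rebuilds them. A self-contained illustration of that mechanism (the class and attribute names here are invented for the demo):

    from functools import cached_property

    class Example:
        def __init__(self) -> None:
            self.builds = 0

        @cached_property
        def planner(self) -> int:
            # stands in for constructing UpdatePlanner/DataWrapper
            self.builds += 1
            return self.builds

        def invalidate(self, *names: str) -> None:
            for name in names:
                self.__dict__.pop(name, None)  # same trick as _invalidate_cached()

    obj = Example()
    assert obj.planner == 1 and obj.planner == 1  # built once, then served from __dict__
    obj.invalidate("planner")
    assert obj.planner == 2                       # rebuilt after invalidation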
sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -202,16 +202,20 @@ class ParquetConfig(BaseModel):
 
     try:
         self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-        return dd.read_parquet(
+        dd_result=dd.read_parquet(
             paths_to_load,
             engine="pyarrow",
             filesystem=self.fs,
             exclude=["_*", ".*"]
         )
+        return dd_result
+    except FileNotFoundError as e:
+        self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
+        self.logger.debug("Returning empty DataFrame due to missing parquet files.")
+        return dd.from_pandas(pd.DataFrame(), npartitions=1)
     except Exception as e:
-        # This robust error handling is excellent.
-        self.logger.error(f"Parquet loading failed for paths {paths_to_load}: {e}", exc_info=True)
-        self.logger.warning("Returning empty DataFrame due to loading error.")
+        self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
+        self.logger.debug("Returning empty DataFrame due to loading error.")
         return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
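The second hunk changes ParquetConfig's Parquet loading to catch FileNotFoundError and quietly hand back an empty Dask DataFrame, and to demote the remaining failure logging from error/warning to debug. A standalone sketch of the same degrade-to-empty pattern, independent of sibi-dst (read_parquet_or_empty is an invented helper, and which exception a given fsspec backend raises for a missing path is an assumption):

    import dask.dataframe as dd
    import pandas as pd

    def read_parquet_or_empty(path: str) -> dd.DataFrame:
        """Read Parquet with Dask; degrade to an empty DataFrame when nothing is readable."""
        try:
            return dd.read_parquet(path, engine="pyarrow")
        except FileNotFoundError:
            # Missing partitions are treated as "no data yet", mirroring the hunk above.
            return dd.from_pandas(pd.DataFrame(), npartitions=1)
        except Exception:
            # Any other read failure also degrades to an empty frame instead of raising.
            return dd.from_pandas(pd.DataFrame(), npartitions=1)

    ddf = read_parquet_or_empty("/tmp/does-not-exist.parquet")
    print(len(ddf.columns))  # 0 -> an empty frame came back instead of an exception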