sibi-dst 2025.1.13-py3-none-any.whl → 2025.8.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_parquet_artifact.py
@@ -1,328 +1,320 @@
 from __future__ import annotations
-import datetime
+
+import datetime as dt
 import threading
-from typing import Optional, Any, Dict, ClassVar
+from functools import cached_property
+from typing import Any, Dict, Type, TypeVar

-import dask.dataframe as dd
-import fsspec
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
+from sibi_dst.utils import MissingManifestManager, Logger

-from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
-from sibi_dst.utils import MissingManifestManager
+T = TypeVar("T")


-class ParquetArtifact(DfHelper):
+class ParquetArtifact(ManagedResource):
     """
-    Class designed to manage Parquet data storage and retrieval using a specified
-    DataWrapper class for data processing. It provides functionality for loading,
-    updating, rebuilding, and generating Parquet files within a configurable
-    storage filesystem. The class ensures that all essential configurations and
-    filesystems are properly set up before operations.
-
-    Detailed functionality includes support for dynamically managing and generating
-    Parquet files based on time periods, with customizable options for paths,
-    filenames, date fields, and more. It is an abstraction for efficiently handling
-    storage tasks related to distributed or local file systems.
-
-    :ivar config: Configuration dictionary containing all configurable parameters
-                  for managing Parquet data storage, such as paths, filenames,
-                  and date ranges.
-    :type config: dict
-    :ivar df: Cached Dask DataFrame used to store and manipulate data loaded
-              from the Parquet file.
-    :type df: Optional[dask.dataframe.DataFrame]
-    :ivar data_wrapper_class: Class responsible for abstracting data processing
-                              operations required for Parquet file generation.
-    :type data_wrapper_class: type
-    :ivar date_field: Name of the field used to identify and process data by date.
-    :type date_field: Optional[str]
-    :ivar parquet_storage_path: Filesystem path to store Parquet files.
-    :type parquet_storage_path: Optional[str]
-    :ivar parquet_filename: Name of the Parquet file to be generated and managed.
-    :type parquet_filename: Optional[str]
-    :ivar parquet_start_date: Date string specifying the start date for data range
-                              processing.
-    :type parquet_start_date: Optional[str]
-    :ivar parquet_end_date: Date string specifying the end date for data range
-                            processing.
-    :type parquet_end_date: Optional[str]
-    :ivar filesystem_type: Type of the filesystem used for managing storage
-                           operations (e.g., `file`, `s3`, etc.).
-    :type filesystem_type: str
-    :ivar filesystem_options: Additional options for configuring the filesystem.
-    :type filesystem_options: dict
-    :ivar fs: Filesystem object used for storage operations.
-    :type fs: fsspec.AbstractFileSystem
+    Orchestrates a single dataset:
+      - Builds/uses MissingManifestManager
+      - Plans work with UpdatePlanner
+      - Executes with DataWrapper (threaded) saving Dask Parquet
+      - Prevents duplicate concurrent runs per (storage_path, filename)
+      - Forwards retry/backoff knobs to DataWrapper.process()
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str, str]] = {
-        'backend': 'parquet'
-    }

+    _global_lock = threading.RLock()
+    _active_runs: set[tuple[str, str]] = set()

-    def __init__(self, data_wrapper_class, **kwargs):
-        """
-        Initializes an instance of the class with given configuration and validates
-        required parameters. Sets up the filesystem to handle storage, ensuring
-        necessary directories exist. The configuration supports a variety of options
-        to manage parquet storage requirements, including paths, filenames, and date
-        ranges.
-
-        :param data_wrapper_class: The class responsible for wrapping data to be managed
-                                   by this instance.
-        :type data_wrapper_class: type
-        :param kwargs: Arbitrary keyword arguments to override default configuration.
-                       Includes settings for `date_field`, `parquet_storage_path`,
-                       `parquet_filename`, `parquet_start_date`, `parquet_end_date`,
-                       `filesystem_type`, `filesystem_options`, and `fs`.
-        :type kwargs: dict
-
-        :raises ValueError: If any of the required configuration options
-                            (`date_field`, `parquet_storage_path`,
-                            `parquet_filename`, `parquet_start_date`,
-                            or `parquet_end_date`) are missing or not set properly.
-        """
+    def __init__(self, **kwargs: Any):
+        # Merge defaults from ManagedResource and caller kwargs
+        self.all_kwargs: Dict[str, Any] = {**kwargs}
+        super().__init__(**self.all_kwargs)

-        """Initialize with config, validate required fields, and setup filesystem."""
-        self._lock = threading.Lock()
-        self.config = {
-            **self.DEFAULT_CONFIG,
-            **kwargs,
-        }
-        self.df: Optional[dd.DataFrame] = None
-        super().__init__(**self.config)
-        self.data_wrapper_class = data_wrapper_class
-
-        self.date_field = self._validate_required('date_field')
-        self.parquet_storage_path = self._validate_required('parquet_storage_path')
-        self.parquet_filename = self._validate_required('parquet_filename')
-        self.parquet_start_date = self._validate_required('parquet_start_date')
-        self.parquet_end_date = self._validate_required('parquet_end_date')
-
-        self.class_params = self.config.pop('class_params', {
-            'debug': self.debug,
-            'logger': self.logger,
-            'fs': self.fs,
-            'verbose': self.verbose,
-        })
-        # Populate parameters to pass to load method of DataWrapper class
-        self.load_params = self.config.setdefault('load_params', {})
-        # Ensure the directory exists
-        self.ensure_directory_exists(self.parquet_storage_path)
-        #super().__init__(**self.config)
-        self.update_planner_params = {}
-        self.datawrapper_params = {}
-
-    def _validate_required(self, key: str) -> Any:
-        """Validate required configuration fields."""
-        value = self.config.setdefault(key, None)
-        if value is None:
-            raise ValueError(f'{key} must be set')
-        return value
-
-    def _setup_manifest(self, overwrite: bool = False, ignore_missing: bool = False):
-        self.skipped = []
-        self.missing_manifest_path = f"{self.parquet_storage_path}_manifests/missing.parquet"
-        self.mmanifest = MissingManifestManager(
+        # Persist the minimal config we depend on frequently
+        self._lock = threading.RLock()
+
+        # Required knobs
+        self._storage_path: str = self.all_kwargs["parquet_storage_path"]
+        self._parquet_filename: str = self.all_kwargs["parquet_filename"]
+        self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
+
+    # ---------- helpers ----------
+    def _invalidate_cached(self, *names: str) -> None:
+        for n in names:
+            self.__dict__.pop(n, None)
+
+    def _build_manifest_path(self) -> str:
+        base = f"{self._storage_path}".rstrip("/") + "/"
+        return f"{base}_manifests/missing.parquet"
+
+    # ---------- lazy members ----------
+    @cached_property
+    def mmanifest(self) -> MissingManifestManager:
+        self.logger.info("Initializing MissingManifestManager...")
+        manifest_path = self._build_manifest_path()
+
+        # ensure manifest directory exists
+        manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
+        self.ensure_directory_exists(manifest_dir)
+
+        mgr = MissingManifestManager(
             fs=self.fs,
-            manifest_path=self.missing_manifest_path,
-            clear_existing=overwrite,
-            debug= self.debug,
-            logger=self.logger
+            manifest_path=manifest_path,
+            clear_existing=self.all_kwargs.get("overwrite", False),
+            debug=self.debug,
+            logger=self.logger,
+            overwrite=self.all_kwargs.get("overwrite", False),
         )

-        # Initialize skipped files
-        manifest_exists = self.mmanifest._safe_exists(self.missing_manifest_path)
-        if not manifest_exists:
-            self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
-            self.mmanifest.save()
-            #self.mmanifest.cleanup_temp_manifests()
+        if not mgr._safe_exists(mgr.manifest_path):
+            self.logger.info(f"Creating new manifest at {mgr.manifest_path}")
+            mgr.save()
         else:
-            self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
+            self.logger.info(f"Manifest already exists at {mgr.manifest_path}")
+
+        return mgr
+
+    @cached_property
+    def update_planner(self) -> UpdatePlanner:
+        self.logger.info("Initializing UpdatePlanner...")
+        skipped_files = self.mmanifest.load_existing() or []
+
+        cfg = {
+            **self.all_kwargs,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
+            "skipped": list(skipped_files),
+            "mmanifest": self.mmanifest,
+        }
+        return UpdatePlanner(**cfg)

-        # Load skipped files if manifest exists and ignore_missing is True
-        self.skipped = self.mmanifest.load_existing()  # if ignore_missing and manifest_exists else []
-        self.logger.info(f"Skipped: {self.skipped}")
-        if overwrite:
-            self.skipped = []
-            self.ignore_missing = False
+    @cached_property
+    def data_wrapper(self) -> DataWrapper:
+        self.logger.info("Initializing DataWrapper...")

-    def _setup_update_planner(self, **kwargs) -> None:
-        self._prepare_update_params(**kwargs)
-        self.update_planner = UpdatePlanner(**self.update_planner_params)
-        self.update_planner.generate_plan(start=self.start_date, end=self.end_date)
+        # Ensure the planner has a plan
+        if getattr(self.update_planner, "plan", None) is None:
+            self.update_planner.generate_plan()

-    def load(self, **kwargs):
-        with self._lock:
-            self.df = super().load(**kwargs)
-            return self.df
+        class_params = {
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": self.verbose,
+        }

-    def generate_parquet(self, **kwargs) -> None:
-        """
-        Generate a Parquet file using the configured DataWrapper class.
-        """
-        with self._lock:
-            overwrite = kwargs.get('overwrite', False)
-            ignore_missing = kwargs.get('ignore_missing', False)
-            self._setup_manifest(overwrite, ignore_missing)
-            self._setup_update_planner(**kwargs)
-            params = self.datawrapper_params.copy()
-            params.update({
-                'mmanifest': self.mmanifest,
-                'update_planner': self.update_planner
-            })
-
-            with DataWrapper(self.data_wrapper_class, **params) as dw:
-                dw.process()
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        try:
-            if self.mmanifest and self.mmanifest._new_records:
-                self.mmanifest.save()
-        except Exception as e:
-            self.logger.warning(f"Error closing filesystem: {e}")
-        finally:
-            super().__exit__(exc_type, exc_value, traceback)
-        # return False so exceptions aren’t suppressed
-        return False
+        cfg = {
+            "data_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "verbose": self.verbose,
+            "dataclass": self._data_wrapper_class,
+            "class_params": class_params,
+            "load_params": self.all_kwargs.get("load_params", {}) or {},
+            "mmanifest": self.mmanifest,
+            "update_planner": self.update_planner,
+            "date_field": self.all_kwargs.get("date_field"),
+            # pipeline execution knobs
+            "show_progress": bool(self.all_kwargs.get("show_progress", False)),
+            "timeout": float(self.all_kwargs.get("timeout", 30.0)),
+            "max_threads": int(self.all_kwargs.get("max_threads", 3)),
+        }
+        return DataWrapper(**cfg)

-    def get_size_estimate(self, **kwargs) -> int:
+    # ---------- public API ----------
+    def load(self, **kwargs: Any):
         """
-        Synchronously estimates artifact size for use in multi-threaded environments.
-
-        This method safely executes asynchronous I/O operations from a synchronous
-        context, handling variations in fsspec filesystem implementations.
+        Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
+        Expected to return a Dask DataFrame from the loader.
         """
+        self.logger.info(f"Loading data from {self._storage_path}")
+
+        if not self._data_wrapper_class:
+            raise ValueError("data_wrapper_class is not configured.")
+
+        params = {
+            "backend": "parquet",
+            "fs": self.fs,
+            "logger": self.logger,
+            "debug": self.debug,
+            "parquet_storage_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
+            "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+            **(self.all_kwargs.get("class_params") or {}),
+        }

-        async def _get_total_bytes_async():
-            """A helper async coroutine to perform the I/O."""
-            import asyncio
-
-            files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
-            if not files:
-                return 0
+        cls = self._data_wrapper_class
+        with cls(**params) as instance:
+            return instance.load(**kwargs)

-            size_tasks = [self.fs._size(f) for f in files]
-            sizes = await asyncio.gather(*size_tasks)
-            return sum(s for s in sizes if s is not None)
+    def generate_parquet(self, **kwargs: Any) -> None:
+        """
+        Generate or update Parquet according to the plan.
+        - Merges runtime kwargs
+        - Invalidates dependent caches
+        - Guards against duplicate concurrent runs
+        - Forwards retry/backoff to DataWrapper.process()
+        """
+        # Merge and invalidate caches that depend on runtime changes
+        self.all_kwargs.update(kwargs)
+        self._invalidate_cached("update_planner", "data_wrapper")
+        if "overwrite" in kwargs:
+            self._invalidate_cached("mmanifest")
+
+        # Global de-dupe guard
+        key = (self._storage_path, self._parquet_filename)
+        with ParquetArtifact._global_lock:
+            if key in ParquetArtifact._active_runs:
+                self.logger.info(
+                    f"Run already in progress for {key}; skipping this invocation."
+                )
+                return
+            ParquetArtifact._active_runs.add(key)

         try:
-            # Attempt the standard fsspec method first
-            total_bytes = self.fs.sync(_get_total_bytes_async())
-        except AttributeError:
-            # fallback for filesystems like s3fs that lack .sync()
-            total_bytes = self.fs.loop.run_until_complete(_get_total_bytes_async())
+            self.ensure_directory_exists(self._storage_path)
+
+            self.update_planner.generate_plan()
+            plan = getattr(self.update_planner, "plan", None)
+            if plan is None or (hasattr(plan, "empty") and plan.empty):
+                # Planning uses Pandas; this is safe to check.
+                self.logger.info("No updates needed. Skipping Parquet generation.")
+                return
+
+            # Print plan once per run
+            if (
+                getattr(self.update_planner, "show_progress", False)
+                and not getattr(self.update_planner, "_printed_this_run", False)
+            ):
+                self.update_planner.show_update_plan()
+                setattr(self.update_planner, "_printed_this_run", True)
+
+            # ---- forward retry/backoff knobs to DataWrapper.process() ----
+            dw_retry_kwargs = {
+                k: self.all_kwargs[k]
+                for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
+                if k in self.all_kwargs
+            }

-        # Convert to megabytes, ensuring a minimum of 1
-        return max(1, int(total_bytes / (1024 ** 2)))
+            with self._lock:
+                dw = self.data_wrapper  # single cached_property access
+                if hasattr(dw, "process"):
+                    dw.process(**dw_retry_kwargs)
+                if getattr(self.update_planner, "show_progress", False) and hasattr(
+                    dw, "show_benchmark_summary"
+                ):
+                    dw.show_benchmark_summary()

-    def update_parquet(self, period: str = 'today', **kwargs) -> None:
-        """Update the Parquet file with data from a specific period."""
+        finally:
+            with ParquetArtifact._global_lock:
+                ParquetArtifact._active_runs.discard(key)
+
+    def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
+        """
+        High-level entry point to update Parquet for a given period:
+          - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
+          - 'ytd'
+          - 'itd' (requires history_begins_on)
+          - 'custom' (requires start_on / end_on)
+        Also accepts retry/backoff knobs which flow to DataWrapper.process().
+        """
+        final_kwargs = {**self.all_kwargs, **kwargs}

         def itd_config():
-            try:
-                start_date = kwargs.pop('history_begins_on')
-            except KeyError:
-                raise ValueError("For period 'itd', you must provide 'history_begins_on' in kwargs.")
-            return {'parquet_start_date': start_date, 'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')}
+            start_date = final_kwargs.get("history_begins_on")
+            if not start_date:
+                raise ValueError(
+                    "For period 'itd', 'history_begins_on' must be configured."
+                )
+            return {
+                "parquet_start_date": start_date,
+                "parquet_end_date": dt.date.today(),
+            }

         def ytd_config():
             return {
-                'parquet_start_date': datetime.date(datetime.date.today().year, 1, 1).strftime('%Y-%m-%d'),
-                'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')
+                "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
+                "parquet_end_date": dt.date.today(),
             }

         def custom_config():
-            try:
-                start_date = kwargs.pop('start_on')
-                end_date = kwargs.pop('end_on')
-            except KeyError:
-                raise ValueError("For period 'custom', you must provide 'start_on' in kwargs.")
-            return {
-                'parquet_start_date': start_date,
-                'parquet_end_date': end_date
+            """
+            Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
+            are provided (with backward compatibility for `start_date`/`end_date` aliases).
+            """
+            # Backward compatibility: normalize aliases
+            alias_map = {
+                "start_on": ("start_date", "start"),
+                "end_on": ("end_date", "end"),
             }
+            normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
+            for target, aliases in alias_map.items():
+                if target not in normalized_kwargs:
+                    for alias in aliases:
+                        if alias in normalized_kwargs:
+                            normalized_kwargs[target] = normalized_kwargs[alias]
+                            break
+
+            # Validation
+            missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
+            if missing:
+                raise ValueError(
+                    f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
+                )

-        config_map = {
-            'itd': itd_config,
-            'ytd': ytd_config,
-            'custom': custom_config,
-        }
+            return {
+                "parquet_start_date": normalized_kwargs["start_on"],
+                "parquet_end_date": normalized_kwargs["end_on"],
+            }

-        if period in config_map:
-            kwargs.update(config_map[period]())
+        if period == "itd":
+            period_params = itd_config()
+        elif period == "ytd":
+            period_params = ytd_config()
+        elif period == "custom":
+            period_params = custom_config()
         else:
-            kwargs.update(self.parse_parquet_period(period=period))
-        self.logger.debug(f"kwargs passed to update parquet: {kwargs}")
-        self.generate_parquet(**kwargs)
-
-    def rebuild_parquet(self, **kwargs) -> None:
-        """Rebuild the Parquet file from the start to end date."""
-        kwargs.update(self._get_rebuild_params(kwargs))
-        self.generate_parquet(**kwargs)
-
-    def _get_rebuild_params(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Prepare parameters for rebuilding the Parquet file."""
-        return {
-            'overwrite': True,
-            'reverse_order': True,
-            'start_date': kwargs.get('parquet_start_date', self.parquet_start_date),
-            'end_date': kwargs.get('parquet_end_date', self.parquet_end_date),
-        }
+            start_date, end_date = DateUtils.parse_period(period=period)
+            period_params = {
+                "parquet_start_date": start_date,
+                "parquet_end_date": end_date,
+            }

-    def _prepare_update_params(self, **kwargs) -> Dict[str, Any]:
-        self.reverse_order = kwargs.pop('reverse_order', True)
-        self.overwrite = kwargs.pop('overwrite', False)
-        self.ignore_missing = kwargs.pop('ignore_missing', False)
-        self.history_days_threshold = kwargs.pop('history_days_threshold', 30)
-        self.max_age_minutes = kwargs.pop('max_age_minutes', 10)
-        self.show_progress = kwargs.pop('show_progress', False)
-        self.start_date = kwargs.pop('parquet_start_date', self.parquet_start_date)
-        self.end_date = kwargs.pop('parquet_end_date', self.parquet_end_date)
-        self.parquet_filename = kwargs.pop('parquet_filename', self.parquet_filename)
-        self.verbose = kwargs.pop('verbose', False)
-
-        self.update_planner_params.update({
-            'filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'reverse_order': self.reverse_order,
-            'overwrite': self.overwrite,
-            'ignore_missing': self.ignore_missing,
-            'history_days_threshold': self.history_days_threshold,
-            'max_age_minutes': self.max_age_minutes,
-            'show_progress': self.show_progress,
-            'description': f"{self.data_wrapper_class.__name__}",
-            'skipped': self.skipped,
-            'verbose': self.verbose,
-        })
-
-        self.datawrapper_params = {
-            'parquet_filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'class_params': self.class_params,
-            'date_field': self.date_field,
-            'load_params': self.load_params,
-            'verbose': self.verbose
-        }
+        final_kwargs.update(period_params)
+        self.logger.debug(
+            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}"
+        )

-    def parse_parquet_period(self, **kwargs):
-        start_date, end_date = DateUtils.parse_period(**kwargs)
-        self.parquet_start_date = start_date.strftime('%Y-%m-%d')
-        self.parquet_end_date = end_date.strftime('%Y-%m-%d')
-        return {
-            'parquet_start_date': self.parquet_start_date,
-            'parquet_end_date': self.parquet_end_date,
-        }
+        # Delegate to generator (handles cache invalidation + forwarding knobs)
+        self.generate_parquet(**final_kwargs)

+    # ---------- utils ----------
     def ensure_directory_exists(self, path: str) -> None:
-        """Ensure the directory exists in the specified filesystem."""
+        """Ensure the directory exists across fsspec backends."""
         with self._lock:
-            try:
-                self.fs.makedirs(path, exist_ok=True)
-            except Exception as e:
-                raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+            if not self.fs.exists(path):
+                self.logger.info(f"Creating directory: {path}")
+                try:
+                    self.fs.makedirs(path, exist_ok=True)
+                except TypeError:
+                    try:
+                        self.fs.makedirs(path)
+                    except FileExistsError:
+                        pass
+
+    def _cleanup(self):
+        """Clean up resources upon exit."""
+        try:
+            if "mmanifest" in self.__dict__ and getattr(
+                self.mmanifest, "_new_records", None
+            ):
+                if self.mmanifest._new_records:
+                    self.mmanifest.save()
+            if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                self.data_wrapper.close()
+        except Exception as e:
+            self.logger.warning(f"Error during resource cleanup: {e}")
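
For orientation, below is a minimal, hypothetical usage sketch of the reworked ParquetArtifact shown in the diff above. The loader class (MyDatasetLoader), the import of the loader, and the storage paths are placeholders; the keyword names (parquet_storage_path, parquet_filename, data_wrapper_class, date_field, and the retry/backoff knobs) are taken from the new code, but the exact constructor signature and which arguments ManagedResource accepts should be verified against the released 2025.8.2 package.

import fsspec

from sibi_dst.df_helper import ParquetArtifact  # assumed export path for the new class
from my_project.loaders import MyDatasetLoader  # hypothetical DfHelper-style loader with .load()

# Local filesystem for illustration; any fsspec filesystem should work the same way.
fs = fsspec.filesystem("file")

artifact = ParquetArtifact(
    data_wrapper_class=MyDatasetLoader,        # class used by DataWrapper to fetch source data
    parquet_storage_path="/data/my_dataset",   # placeholder storage path
    parquet_filename="my_dataset.parquet",
    date_field="created_at",
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
    fs=fs,
    show_progress=True,
    max_threads=3,
)

# Period-based update; per the diff, retry/backoff knobs are forwarded to DataWrapper.process(),
# and duplicate concurrent runs for the same (storage_path, filename) are skipped.
artifact.update_parquet(
    period="ytd",
    max_retries=3,
    backoff_base=2.0,
)

# Direct read of the stored Parquet via the configured loader; expected to return a Dask DataFrame.
df = artifact.load()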