sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +418 -118
- sibi_dst/df_helper/_parquet_artifact.py +275 -283
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +235 -100
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +392 -88
- sibi_dst/utils/date_utils.py +711 -393
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_parquet_artifact.py
@@ -1,328 +1,320 @@
 from __future__ import annotations
-
+
+import datetime as dt
 import threading
-from
+from functools import cached_property
+from typing import Any, Dict, Type, TypeVar
 
-
-import
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
+from sibi_dst.utils import MissingManifestManager, Logger
 
-
-from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
-from sibi_dst.utils import MissingManifestManager
+T = TypeVar("T")
 
 
-class ParquetArtifact(
+class ParquetArtifact(ManagedResource):
     """
-
-
-
-
-
-
-    Detailed functionality includes support for dynamically managing and generating
-    Parquet files based on time periods, with customizable options for paths,
-    filenames, date fields, and more. It is an abstraction for efficiently handling
-    storage tasks related to distributed or local file systems.
-
-    :ivar config: Configuration dictionary containing all configurable parameters
-                  for managing Parquet data storage, such as paths, filenames,
-                  and date ranges.
-    :type config: dict
-    :ivar df: Cached Dask DataFrame used to store and manipulate data loaded
-              from the Parquet file.
-    :type df: Optional[dask.dataframe.DataFrame]
-    :ivar data_wrapper_class: Class responsible for abstracting data processing
-                              operations required for Parquet file generation.
-    :type data_wrapper_class: type
-    :ivar date_field: Name of the field used to identify and process data by date.
-    :type date_field: Optional[str]
-    :ivar parquet_storage_path: Filesystem path to store Parquet files.
-    :type parquet_storage_path: Optional[str]
-    :ivar parquet_filename: Name of the Parquet file to be generated and managed.
-    :type parquet_filename: Optional[str]
-    :ivar parquet_start_date: Date string specifying the start date for data range
-                              processing.
-    :type parquet_start_date: Optional[str]
-    :ivar parquet_end_date: Date string specifying the end date for data range
-                            processing.
-    :type parquet_end_date: Optional[str]
-    :ivar filesystem_type: Type of the filesystem used for managing storage
-                           operations (e.g., `file`, `s3`, etc.).
-    :type filesystem_type: str
-    :ivar filesystem_options: Additional options for configuring the filesystem.
-    :type filesystem_options: dict
-    :ivar fs: Filesystem object used for storage operations.
-    :type fs: fsspec.AbstractFileSystem
+    Orchestrates a single dataset:
+      - Builds/uses MissingManifestManager
+      - Plans work with UpdatePlanner
+      - Executes with DataWrapper (threaded) saving Dask → Parquet
+      - Prevents duplicate concurrent runs per (storage_path, filename)
+      - Forwards retry/backoff knobs to DataWrapper.process()
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str, str]] = {
-        'backend': 'parquet'
-    }
 
+    _global_lock = threading.RLock()
+    _active_runs: set[tuple[str, str]] = set()
 
-    def __init__(self,
-
-
-
-        necessary directories exist. The configuration supports a variety of options
-        to manage parquet storage requirements, including paths, filenames, and date
-        ranges.
-
-        :param data_wrapper_class: The class responsible for wrapping data to be managed
-                                   by this instance.
-        :type data_wrapper_class: type
-        :param kwargs: Arbitrary keyword arguments to override default configuration.
-                       Includes settings for `date_field`, `parquet_storage_path`,
-                       `parquet_filename`, `parquet_start_date`, `parquet_end_date`,
-                       `filesystem_type`, `filesystem_options`, and `fs`.
-        :type kwargs: dict
-
-        :raises ValueError: If any of the required configuration options
-                            (`date_field`, `parquet_storage_path`,
-                            `parquet_filename`, `parquet_start_date`,
-                            or `parquet_end_date`) are missing or not set properly.
-        """
+    def __init__(self, **kwargs: Any):
+        # Merge defaults from ManagedResource and caller kwargs
+        self.all_kwargs: Dict[str, Any] = {**kwargs}
+        super().__init__(**self.all_kwargs)
 
-
-        self._lock = threading.
-
-
-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.ensure_directory_exists(
-
-
-        self.datawrapper_params = {}
-
-    def _validate_required(self, key: str) -> Any:
-        """Validate required configuration fields."""
-        value = self.config.setdefault(key, None)
-        if value is None:
-            raise ValueError(f'{key} must be set')
-        return value
-
-    def _setup_manifest(self, overwrite: bool = False, ignore_missing: bool = False):
-        self.skipped = []
-        self.missing_manifest_path = f"{self.parquet_storage_path}_manifests/missing.parquet"
-        self.mmanifest = MissingManifestManager(
+        # Persist the minimal config we depend on frequently
+        self._lock = threading.RLock()
+
+        # Required knobs
+        self._storage_path: str = self.all_kwargs["parquet_storage_path"]
+        self._parquet_filename: str = self.all_kwargs["parquet_filename"]
+        self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
+
+    # ---------- helpers ----------
+    def _invalidate_cached(self, *names: str) -> None:
+        for n in names:
+            self.__dict__.pop(n, None)
+
+    def _build_manifest_path(self) -> str:
+        base = f"{self._storage_path}".rstrip("/") + "/"
+        return f"{base}_manifests/missing.parquet"
+
+    # ---------- lazy members ----------
+    @cached_property
+    def mmanifest(self) -> MissingManifestManager:
+        self.logger.info("Initializing MissingManifestManager...")
+        manifest_path = self._build_manifest_path()
+
+        # ensure manifest directory exists
+        manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
+        self.ensure_directory_exists(manifest_dir)
+
+        mgr = MissingManifestManager(
             fs=self.fs,
-            manifest_path=
-            clear_existing=overwrite,
-            debug=
-            logger=self.logger
+            manifest_path=manifest_path,
+            clear_existing=self.all_kwargs.get("overwrite", False),
+            debug=self.debug,
+            logger=self.logger,
+            overwrite=self.all_kwargs.get("overwrite", False),
         )
 
-
-
-
-            self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
-            self.mmanifest.save()
-            #self.mmanifest.cleanup_temp_manifests()
+        if not mgr._safe_exists(mgr.manifest_path):
+            self.logger.info(f"Creating new manifest at {mgr.manifest_path}")
+            mgr.save()
         else:
-            self.logger.info(f"Manifest already exists at {
+            self.logger.info(f"Manifest already exists at {mgr.manifest_path}")
+
+        return mgr
+
+    @cached_property
+    def update_planner(self) -> UpdatePlanner:
+        self.logger.info("Initializing UpdatePlanner...")
+        skipped_files = self.mmanifest.load_existing() or []
+
+        cfg = {
+            **self.all_kwargs,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
+            "skipped": list(skipped_files),
+            "mmanifest": self.mmanifest,
+        }
+        return UpdatePlanner(**cfg)
 
-
-
-        self.logger.info(
-        if overwrite:
-            self.skipped = []
-            self.ignore_missing = False
+    @cached_property
+    def data_wrapper(self) -> DataWrapper:
+        self.logger.info("Initializing DataWrapper...")
 
-
-        self.
-
-        self.update_planner.generate_plan(start=self.start_date,end= self.end_date)
+        # Ensure the planner has a plan
+        if getattr(self.update_planner, "plan", None) is None:
+            self.update_planner.generate_plan()
 
-
-
-        self.
-
+        class_params = {
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": self.verbose,
+        }
 
-
-
-
-
-
-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-        try:
-            if self.mmanifest and self.mmanifest._new_records:
-                self.mmanifest.save()
-        except Exception as e:
-            self.logger.warning(f"Error closing filesystem: {e}")
-        finally:
-            super().__exit__(exc_type, exc_value, traceback)
-        # return False so exceptions aren’t suppressed
-        return False
+        cfg = {
+            "data_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "verbose": self.verbose,
+            "dataclass": self._data_wrapper_class,
+            "class_params": class_params,
+            "load_params": self.all_kwargs.get("load_params", {}) or {},
+            "mmanifest": self.mmanifest,
+            "update_planner": self.update_planner,
+            "date_field": self.all_kwargs.get("date_field"),
+            # pipeline execution knobs
+            "show_progress": bool(self.all_kwargs.get("show_progress", False)),
+            "timeout": float(self.all_kwargs.get("timeout", 30.0)),
+            "max_threads": int(self.all_kwargs.get("max_threads", 3)),
+        }
+        return DataWrapper(**cfg)
 
-
+    # ---------- public API ----------
+    def load(self, **kwargs: Any):
         """
-
-
-        This method safely executes asynchronous I/O operations from a synchronous
-        context, handling variations in fsspec filesystem implementations.
+        Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
+        Expected to return a Dask DataFrame from the loader.
         """
+        self.logger.info(f"Loading data from {self._storage_path}")
+
+        if not self._data_wrapper_class:
+            raise ValueError("data_wrapper_class is not configured.")
+
+        params = {
+            "backend": "parquet",
+            "fs": self.fs,
+            "logger": self.logger,
+            "debug": self.debug,
+            "parquet_storage_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
+            "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+            **(self.all_kwargs.get("class_params") or {}),
+        }
 
-
-
-
-
-        files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
-        if not files:
-            return 0
+        cls = self._data_wrapper_class
+        with cls(**params) as instance:
+            return instance.load(**kwargs)
 
-
-
-
+    def generate_parquet(self, **kwargs: Any) -> None:
+        """
+        Generate or update Parquet according to the plan.
+        - Merges runtime kwargs
+        - Invalidates dependent caches
+        - Guards against duplicate concurrent runs
+        - Forwards retry/backoff to DataWrapper.process()
+        """
+        # Merge and invalidate caches that depend on runtime changes
+        self.all_kwargs.update(kwargs)
+        self._invalidate_cached("update_planner", "data_wrapper")
+        if "overwrite" in kwargs:
+            self._invalidate_cached("mmanifest")
+
+        # Global de-dupe guard
+        key = (self._storage_path, self._parquet_filename)
+        with ParquetArtifact._global_lock:
+            if key in ParquetArtifact._active_runs:
+                self.logger.info(
+                    f"Run already in progress for {key}; skipping this invocation."
+                )
+                return
+            ParquetArtifact._active_runs.add(key)
 
         try:
-
-
-
-
-
+            self.ensure_directory_exists(self._storage_path)
+
+            self.update_planner.generate_plan()
+            plan = getattr(self.update_planner, "plan", None)
+            if plan is None or (hasattr(plan, "empty") and plan.empty):
+                # Planning uses Pandas; this is safe to check.
+                self.logger.info("No updates needed. Skipping Parquet generation.")
+                return
+
+            # Print plan once per run
+            if (
+                getattr(self.update_planner, "show_progress", False)
+                and not getattr(self.update_planner, "_printed_this_run", False)
+            ):
+                self.update_planner.show_update_plan()
+                setattr(self.update_planner, "_printed_this_run", True)
+
+            # ---- forward retry/backoff knobs to DataWrapper.process() ----
+            dw_retry_kwargs = {
+                k: self.all_kwargs[k]
+                for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
+                if k in self.all_kwargs
+            }
 
-
-
+            with self._lock:
+                dw = self.data_wrapper  # single cached_property access
+                if hasattr(dw, "process"):
+                    dw.process(**dw_retry_kwargs)
+                if getattr(self.update_planner, "show_progress", False) and hasattr(
+                    dw, "show_benchmark_summary"
+                ):
+                    dw.show_benchmark_summary()
 
-
-
+        finally:
+            with ParquetArtifact._global_lock:
+                ParquetArtifact._active_runs.discard(key)
+
+    def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
+        """
+        High-level entry point to update Parquet for a given period:
+          - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
+          - 'ytd'
+          - 'itd' (requires history_begins_on)
+          - 'custom' (requires start_on / end_on)
+        Also accepts retry/backoff knobs which flow to DataWrapper.process().
+        """
+        final_kwargs = {**self.all_kwargs, **kwargs}
 
         def itd_config():
-
-
-
-
-
+            start_date = final_kwargs.get("history_begins_on")
+            if not start_date:
+                raise ValueError(
+                    "For period 'itd', 'history_begins_on' must be configured."
+                )
+            return {
+                "parquet_start_date": start_date,
+                "parquet_end_date": dt.date.today(),
+            }
 
         def ytd_config():
             return {
-
-
+                "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
+                "parquet_end_date": dt.date.today(),
             }
 
         def custom_config():
-
-
-
-
-
-
-
-
+            """
+            Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
+            are provided (with backward compatibility for `start_date`/`end_date` aliases).
+            """
+            # Backward compatibility: normalize aliases
+            alias_map = {
+                "start_on": ("start_date", "start"),
+                "end_on": ("end_date", "end"),
             }
+            normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
+            for target, aliases in alias_map.items():
+                if target not in normalized_kwargs:
+                    for alias in aliases:
+                        if alias in normalized_kwargs:
+                            normalized_kwargs[target] = normalized_kwargs[alias]
+                            break
+
+            # Validation
+            missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
+            if missing:
+                raise ValueError(
+                    f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
+                )
 
-
-
-
-
-            }
+            return {
+                "parquet_start_date": normalized_kwargs["start_on"],
+                "parquet_end_date": normalized_kwargs["end_on"],
+            }
 
-        if period
-
+        if period == "itd":
+            period_params = itd_config()
+        elif period == "ytd":
+            period_params = ytd_config()
+        elif period == "custom":
+            period_params = custom_config()
         else:
-
-
-
-
-
-        """Rebuild the Parquet file from the start to end date."""
-        kwargs.update(self._get_rebuild_params(kwargs))
-        self.generate_parquet(**kwargs)
-
-    def _get_rebuild_params(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Prepare parameters for rebuilding the Parquet file."""
-        return {
-            'overwrite': True,
-            'reverse_order': True,
-            'start_date': kwargs.get('parquet_start_date', self.parquet_start_date),
-            'end_date': kwargs.get('parquet_end_date', self.parquet_end_date),
-        }
+            start_date, end_date = DateUtils.parse_period(period=period)
+            period_params = {
+                "parquet_start_date": start_date,
+                "parquet_end_date": end_date,
+            }
 
-
-        self.
-
-
-        self.history_days_threshold = kwargs.pop('history_days_threshold', 30)
-        self.max_age_minutes = kwargs.pop('max_age_minutes', 10)
-        self.show_progress = kwargs.pop('show_progress', False)
-        self.start_date = kwargs.pop('parquet_start_date', self.parquet_start_date)
-        self.end_date = kwargs.pop('parquet_end_date', self.parquet_end_date)
-        self.parquet_filename = kwargs.pop('parquet_filename', self.parquet_filename)
-        self.verbose = kwargs.pop('verbose', False)
-
-        self.update_planner_params.update({
-            'filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'reverse_order': self.reverse_order,
-            'overwrite': self.overwrite,
-            'ignore_missing': self.ignore_missing,
-            'history_days_threshold': self.history_days_threshold,
-            'max_age_minutes': self.max_age_minutes,
-            'show_progress': self.show_progress,
-            'description': f"{self.data_wrapper_class.__name__}",
-            'skipped': self.skipped,
-            'verbose': self.verbose,
-        })
-
-        self.datawrapper_params = {
-            'parquet_filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'class_params': self.class_params,
-            'date_field': self.date_field,
-            'load_params': self.load_params,
-            'verbose': self.verbose
-        }
+        final_kwargs.update(period_params)
+        self.logger.debug(
+            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}"
+        )
 
-
-
-        self.parquet_start_date = start_date.strftime('%Y-%m-%d')
-        self.parquet_end_date = end_date.strftime('%Y-%m-%d')
-        return {
-            'parquet_start_date': self.parquet_start_date,
-            'parquet_end_date': self.parquet_end_date,
-        }
+        # Delegate to generator (handles cache invalidation + forwarding knobs)
+        self.generate_parquet(**final_kwargs)
 
+    # ---------- utils ----------
     def ensure_directory_exists(self, path: str) -> None:
-        """Ensure the directory exists
+        """Ensure the directory exists across fsspec backends."""
         with self._lock:
-
-            self.
-
-
+            if not self.fs.exists(path):
+                self.logger.info(f"Creating directory: {path}")
+                try:
+                    self.fs.makedirs(path, exist_ok=True)
+                except TypeError:
+                    try:
+                        self.fs.makedirs(path)
+                    except FileExistsError:
+                        pass
+
+    def _cleanup(self):
+        """Clean up resources upon exit."""
+        try:
+            if "mmanifest" in self.__dict__ and getattr(
+                self.mmanifest, "_new_records", None
+            ):
+                if self.mmanifest._new_records:
+                    self.mmanifest.save()
+            if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                self.data_wrapper.close()
+        except Exception as e:
+            self.logger.warning(f"Error during resource cleanup: {e}")