sibi-dst 2025.9.10__py3-none-any.whl → 2025.9.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +11 -6
- sibi_dst/df_helper/__init__.py +0 -1
- sibi_dst/df_helper/_artifact_updater_async.py +199 -175
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -3
- sibi_dst/osmnx_helper/__init__.py +3 -1
- sibi_dst/utils/__init__.py +2 -1
- sibi_dst/utils/boilerplate/base_pipeline.py +1 -2
- sibi_dst/utils/business_days.py +19 -51
- sibi_dst/utils/dask_utils.py +124 -1
- sibi_dst/utils/data_wrapper.py +0 -11
- sibi_dst/utils/filepath_generator.py +1 -154
- {sibi_dst-2025.9.10.dist-info → sibi_dst-2025.9.12.dist-info}/METADATA +26 -30
- {sibi_dst-2025.9.10.dist-info → sibi_dst-2025.9.12.dist-info}/RECORD +27 -27
- {sibi_dst-2025.9.10.dist-info → sibi_dst-2025.9.12.dist-info}/WHEEL +2 -1
- sibi_dst-2025.9.12.dist-info/top_level.txt +1 -0
- sibi_dst/df_helper/data_cleaner.py +0 -132
sibi_dst/__init__.py
CHANGED
@@ -10,12 +10,17 @@ try:
 except version_reader.PackageNotFoundError:
     __version__ = "unknown"
 
-__all__ = [
-    "__version__",
-]
-
-import sibi_dst.df_helper as df_helper
+from sibi_dst.df_helper import *
 from sibi_dst.osmnx_helper import *
 from sibi_dst.geopy_helper import *
-from sibi_dst
+from sibi_dst import utils as sibiutils
 
+
+__all__ = [
+    "__version__",
+    "DfHelper",
+    "ParquetArtifact",
+    "ParquetReader",
+    "ArtifactUpdaterMultiWrapperAsync",
+    "sibiutils"
+]
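The practical effect of this change is that the package's public names are now re-exported at the top level and enumerated in `__all__`. A minimal usage sketch of the new import surface (assuming sibi_dst 2025.9.12 and its optional helper dependencies are installed; only the names listed in `__all__` above are taken from the package):

import sibi_dst
from sibi_dst import DfHelper, ParquetArtifact, ParquetReader
from sibi_dst import ArtifactUpdaterMultiWrapperAsync, sibiutils

# __version__ is resolved from package metadata and falls back to "unknown".
print(sibi_dst.__version__)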
sibi_dst/df_helper/__init__.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-#from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded
 
sibi_dst/df_helper/_artifact_updater_async.py
CHANGED
@@ -4,28 +4,24 @@ import asyncio
 import datetime
 import random
 import time
-
+import pickle
+from contextlib import ExitStack, suppress
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Sequence, Type
 
-from sibi_dst.utils import ManagedResource
-
-try:
-    from dask.distributed import Client, LocalCluster
-except ImportError:
-    Client = None
-    LocalCluster = None
+from sibi_dst.utils import ManagedResource, Logger
+from sibi_dst.utils.dask_utils import DaskClientMixin
 
 
 @dataclass(slots=True)
 class _RetryCfg:
+    """Retry and backoff configuration."""
     attempts: int = 3
     backoff_base: float = 2.0
     backoff_max: float = 60.0
     jitter: float = 0.15
 
 
-# ---------------- Worker (safe for Dask pickling) ----------------
 def run_artifact_update(
     cls: Type,
     artifact_class_kwargs: Dict[str, Any],
@@ -33,21 +29,43 @@ def run_artifact_update(
     period: str,
     artifact_kwargs: Dict[str, Any],
 ) -> Dict[str, Any]:
-    """
+    """
+    Executed inside Dask worker.
+    Instantiates artifact and runs update_parquet() with retry logic.
+    Reconstructs logger and filesystem if not provided (worker isolation safe).
+    """
     import logging
+    import fsspec
+    from sibi_dst.utils import Logger
 
-    logger
+    # ---- Reinitialize a lightweight logger for the worker
+    worker_logger = Logger.default_logger(logger_name=cls.__name__) if hasattr(Logger, "default_logger") else logging.getLogger(cls.__name__)
+    worker_logger.set_level(logging.INFO)
+
+    # ---- Ensure fs is recreated if missing
+    fs = artifact_class_kwargs.get("fs")
+    if fs is None or isinstance(fs, str):
+        try:
+            fs_protocol = fs if isinstance(fs, str) else "file"
+            fs = fsspec.filesystem(fs_protocol)
+        except Exception:
+            fs = fsspec.filesystem("file")
+
+    # ---- Merge reconstructed environment into kwargs
+    artifact_kwargs_final = {
+        **artifact_class_kwargs,
+        "logger": worker_logger,
+        "fs": fs,
+    }
 
-
-
-    success = False
-    error_msg = None
+    start_time = datetime.datetime.now()
+    success, error_msg, attempts = False, None, 0
 
     for attempt in range(1, retry.attempts + 1):
-
+        attempts = attempt
         try:
             with ExitStack() as stack:
-                inst = cls(**
+                inst = cls(**artifact_kwargs_final)
                 inst = stack.enter_context(inst)
                 inst.update_parquet(period=period, **artifact_kwargs)
                 success = True
@@ -59,31 +77,40 @@ def run_artifact_update(
                 delay *= 1 + random.uniform(0, retry.jitter)
                 time.sleep(delay)
 
-
-
+    duration = (datetime.datetime.now() - start_time).total_seconds()
+    status = "😀" if success else "😩"
+    worker_logger.info(
+        f"{status} {cls.__name__} [{period}] finished in {duration:.2f}s ({attempts} attempt(s))"
+    )
 
     return {
         "artifact": cls.__name__,
         "period": period,
-        "start": start_wall.isoformat(),
-        "end": end_wall.isoformat(),
-        "processing_time": duration,
-        "retries": attempt_count - 1 if success else attempt_count,
         "success": success,
         "error": error_msg,
+        "attempts": attempts,
+        "duration_seconds": duration,
+        "started_at": start_time.isoformat(),
+        "ended_at": datetime.datetime.now().isoformat(),
     }
 
 
-class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
+# ---------------- Async Orchestrator ----------------
+class ArtifactUpdaterMultiWrapperAsync(DaskClientMixin, ManagedResource):
     """
-    Async
-
+    Async orchestrator for concurrent artifact updates.
+
+    • Uses Dask client (via DaskClientMixin) or local threads.
+    • Automatically sanitizes non-picklable arguments (e.g., loggers, fs).
+    • Provides structured retries, async orchestration, and safe cleanup.
     """
 
     def __init__(
         self,
         wrapped_classes: Dict[str, Sequence[Type]],
         *,
+        logger: Logger,
+        fs,
         max_workers: int = 3,
         retry_attempts: int = 3,
         update_timeout_seconds: int = 600,
@@ -92,88 +119,127 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         backoff_jitter: float = 0.15,
         priority_fn: Optional[Callable[[Type], int]] = None,
         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-        dask_client: Optional[Client] = None,
         use_dask: bool = True,
+        dask_client: Optional[Any] = None,
+        debug: bool = False,
+        verbose: bool = False,
         **kwargs: Any,
     ) -> None:
-        super().__init__(
+        super().__init__(logger=logger, fs=fs, debug=debug, verbose=verbose)
+
+        # ---- Client lifecycle management
+
+        self.own_dask_client = dask_client is None
+        self._init_dask_client(dask_client, logger=logger)
+        self.use_dask = use_dask
 
+        # ---- Core configuration
         self.wrapped_classes = wrapped_classes
-        self.max_workers =
-        self.update_timeout_seconds = int(update_timeout_seconds)
+        self.max_workers = max_workers
         self.priority_fn = priority_fn
-        self.
-        self.client: Optional[Client] = dask_client
-        self._owns_client = False
+        self.update_timeout_seconds = update_timeout_seconds
 
+        # ---- Retry configuration
         self._retry = _RetryCfg(
-            attempts=
-            backoff_base=
-            backoff_max=
-            jitter=
+            attempts=retry_attempts,
+            backoff_base=backoff_base,
+            backoff_max=backoff_max,
+            jitter=backoff_jitter,
         )
 
-        #
-
-
-
-
-
-        }
-
-        self.artifact_class_kwargs = {
-            "logger": self.logger,
-            "fs": self.fs,
-            "debug": self.debug,
-            "verbose": self.verbose,
-            **(artifact_class_kwargs or {}),
-        }
+        # ---- Artifact instantiation arguments
+        self.artifact_class_kwargs = {
+            "logger": logger,
+            "fs": fs,
+            "debug": debug,
+            "verbose": verbose,
+            **(artifact_class_kwargs or {}),
+        }
 
+        # ---- Runtime tracking
         self.completion_secs: Dict[str, float] = {}
         self.failed: List[str] = []
-        self.
-
-        if self.use_dask and Client is None:
-            raise RuntimeError("Dask is not installed, cannot use Dask mode")
-
-        # auto-start local client if requested
-        if self.use_dask and not self.client:
-            self.client = Client(
-                LocalCluster(
-                    n_workers=max_workers,
-                    threads_per_worker=1,
-                    dashboard_address=None,
-                )
-            )
-            self._owns_client = True
+        self._stop_event = asyncio.Event()
 
-
+        self.logger_extra = {"sibi_dst_component": self.__class__.__name__}
+
+        if self.use_dask:
+            self.logger.debug(f"Initialized with Dask client: {self.dask_client}")
+        else:
+            self.logger.debug(f"Running in local thread-based mode.")
+
+    async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
+        """Runs updates for all artifacts in a given period."""
+        self.completion_secs.clear()
+        self.failed.clear()
+        classes = self._classes_for(period)
+
+        self.logger.info(
+            f"Starting artifact updates for period '{period}' ({len(classes)} artifacts).",
+            extra=self.logger_extra,
+        )
 
-    def _classes_for(self, period: str) -> List[Type]:
         try:
-
-
-
-
-
-
+            if self.use_dask:
+                futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
+                results = await asyncio.to_thread(lambda: self.dask_client.gather(futures))
+            else:
+                sem = asyncio.Semaphore(self.max_workers)
+                tasks = [self._run_one_async(cls, period, sem, kwargs) for cls in classes]
+                results = await asyncio.gather(*tasks)
+
+            self.logger.info(
+                f"Completed {len(results)} artifact updates for period '{period}'.",
+                extra=self.logger_extra,
+            )
+            return results
+
+        finally:
+            # Always cleanup if we own the client
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
+
+
+    def _sanitize_kwargs_for_dask(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Removes non-picklable runtime objects (e.g., loggers, fs) before sending to Dask.
+        """
+        clean: Dict[str, Any] = {}
+        for k, v in kwargs.items():
             try:
-
-
-
-
+                pickle.dumps(v)
+                clean[k] = v
+            except Exception:
+                self.logger.debug(f"Skipping non-picklable key '{k}' for Dask worker.")
+        return clean
 
     def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
-
+        """Submit one artifact job to Dask."""
+        safe_kwargs = self._sanitize_kwargs_for_dask(self.artifact_class_kwargs)
+        return self.dask_client.submit(
             run_artifact_update,
             cls,
-
+            safe_kwargs,
             self._retry,
             period,
             artifact_kwargs,
             pure=False,
         )
 
+    def _classes_for(self, period: str) -> List[Type]:
+        """Selects artifact classes for the given period."""
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"No artifacts configured for period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes found for '{period}'.")
+
+        if self.priority_fn:
+            with suppress(Exception):
+                classes.sort(key=self.priority_fn)
+        return classes
+
     async def _run_one_async(
         self,
         cls: Type,
@@ -181,112 +247,70 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         sem: asyncio.Semaphore,
         artifact_kwargs: Dict[str, Any],
     ) -> Dict[str, Any]:
-        """
+        """Fallback local async execution (no Dask)."""
         name = cls.__name__
-        [… 44 removed lines not captured in this diff view …]
-                "success": success,
-                "error": error_msg,
-            }
-
-        if success:
-            self.logger.info(f"✅ Artifact {name} succeeded", extra=result)
-            self.completion_secs[name] = duration
-        else:
-            self.logger.error(f"❌ Artifact {name} failed", extra=result)
-            self.failed.append(name)
-
-        return result
+        start_time = datetime.datetime.now()
+
+        async with sem:
+            for attempt in range(1, self._retry.attempts + 1):
+                try:
+                    def _sync_block():
+                        with ExitStack() as stack:
+                            inst = cls(**self.artifact_class_kwargs)
+                            inst = stack.enter_context(inst)
+                            inst.update_parquet(period=period, **artifact_kwargs)
+
+                    await asyncio.wait_for(
+                        asyncio.to_thread(_sync_block),
+                        timeout=self.update_timeout_seconds,
+                    )
+                    duration = (datetime.datetime.now() - start_time).total_seconds()
+                    self.completion_secs[name] = duration
+                    self.logger.info(f"✅ {name} completed in {duration:.2f}s")
+                    return {
+                        "artifact": name,
+                        "period": period,
+                        "success": True,
+                        "attempts": attempt,
+                        "duration_seconds": duration,
+                    }
+
+                except Exception as e:
+                    if attempt < self._retry.attempts:
+                        delay = min(self._retry.backoff_base ** attempt, self._retry.backoff_max)
+                        delay *= 1 + random.uniform(0, self._retry.jitter)
+                        self.logger.warning(f"Retry {attempt}/{self._retry.attempts} for {name}: {e}")
+                        await asyncio.sleep(delay)
+                    else:
+                        duration = (datetime.datetime.now() - start_time).total_seconds()
+                        self.failed.append(name)
+                        self.logger.error(f"❌ {name} failed after {attempt} attempts: {e}")
+                        return {
+                            "artifact": name,
+                            "period": period,
+                            "success": False,
+                            "attempts": attempt,
+                            "error": str(e),
+                            "duration_seconds": duration,
+                        }
 
-    # ---- Public API -----------------------------------------------------------
-
-    async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
-        self.completion_secs.clear()
-        self.failed.clear()
-        classes = self._classes_for(period)
-
-        try:
-            if self.use_dask:
-                futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
-                results = await asyncio.to_thread(lambda: self.client.gather(futures))
-            else:
-                sem = asyncio.Semaphore(self.max_workers)
-                tasks = [
-                    asyncio.create_task(self._run_one_async(cls, period, sem, kwargs))
-                    for cls in classes
-                ]
-                results = await asyncio.gather(*tasks)
-            return results
-        finally:
-            # only shut down if we own the client
-            if self._owns_client:
-                self.close()
 
     def get_update_status(self) -> Dict[str, Any]:
+        """Returns summary of completed, failed, and pending artifacts."""
         done = set(self.completion_secs)
         fail = set(self.failed)
-        all_names = {
+        all_names = {cls.__name__ for v in self.wrapped_classes.values() for cls in v}
         return {
             "total": len(all_names),
             "completed": sorted(done),
             "failed": sorted(fail),
             "pending": sorted(all_names - done - fail),
-            "completion_times":
+            "completion_times": self.completion_secs,
         }
 
-    # ---- Lifecycle ------------------------------------------------------------
-
     def _cleanup(self) -> None:
-        """
-
-
-
-            self.client.close()
-            if cluster is not None:
-                cluster.close()
-        finally:
-            self.client = None
-            self._owns_client = False
+        """Ensures safe resource closure."""
+        with suppress(Exception):
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
 
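Taken as a whole, the rewritten module makes logger and fs required constructor arguments, delegates Dask client lifecycle to DaskClientMixin, and strips non-picklable kwargs before submitting work to workers. A hedged usage sketch based only on the constructor signature and the update_data() coroutine shown above; the artifact classes and their module are hypothetical placeholders, and each is assumed to be a context manager exposing update_parquet(period=..., **kwargs):

import asyncio
import fsspec

from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperAsync
from sibi_dst.utils import Logger

# Hypothetical artifact classes (not part of sibi_dst).
from myproject.artifacts import DailySalesArtifact, DailyInventoryArtifact

wrapper = ArtifactUpdaterMultiWrapperAsync(
    wrapped_classes={"daily": [DailySalesArtifact, DailyInventoryArtifact]},
    logger=Logger.default_logger(logger_name="artifact_updates"),
    fs=fsspec.filesystem("file"),
    max_workers=3,
    retry_attempts=3,
    use_dask=False,   # exercise the local thread-based fallback instead of Dask
)

results = asyncio.run(wrapper.update_data("daily"))
print(wrapper.get_update_status())  # totals plus completed / failed / pending names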
sibi_dst/df_helper/backends/parquet/_parquet_options.py
CHANGED
@@ -231,7 +231,7 @@ class ParquetConfig(BaseModel):
         Builds a list of path patterns for dask.read_parquet.
         Respects partition_on + start/end date if given.
         """
-
+        self.logger.debug(f"_resolve_paths_for_read: {self.partition_on}")
         # Partitioned dataset by column
         if self.partition_on and self.parquet_start_date and self.parquet_end_date:
             if not isinstance(self.partition_on, (list, tuple)):
@@ -244,12 +244,10 @@ class ParquetConfig(BaseModel):
             days = pd.date_range(start=start, end=end, freq="D").date
 
             base = self.parquet_storage_path.rstrip("/")
-            print("base:",base)
             result= [
                 f"{base}/{parts[0]}={d.isoformat()}/*.parquet"
                 for d in days
             ]
-            print("result:",result)
             return result
 
         # Date-ranged folders (non-partitioned, using FilePathGenerator)
sibi_dst/utils/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from .manifest_manager import MissingManifestManager
 __all__ = [
     "Logger",
     "ManagedResource",
+
     "ConfigManager",
     "ConfigLoader",
     "DateUtils",
@@ -42,5 +43,5 @@ __all__ = [
     "FsRegistry",
     "DataFromHttpSource",
     "WebDAVClient",
-    "MissingManifestManager"
+    "MissingManifestManager",
 ]
sibi_dst/utils/boilerplate/base_pipeline.py
CHANGED
@@ -93,7 +93,7 @@ class BasePipeline(ManagedResource):
         df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
         df["partition_date"] = df[self.date_field].dt.date.astype(str)
 
-        out_path = self.storage_path.rstrip("/")
+        out_path = self.storage_path.rstrip("/")
         self.logger.info("Saving dataset to %s", out_path)
         ps = ParquetSaver(
             df_result=df,
@@ -111,7 +111,6 @@ class BasePipeline(ManagedResource):
             parquet_start_date=self.start_date,
             parquet_end_date=self.end_date,
             parquet_storage_path=self.storage_path,
-            parquet_filename=self._get_output_filename(),
             fs=self.fs,
             debug=self.debug,
             logger=self.logger,