sibi-dst 2025.9.10-py3-none-any.whl → 2025.9.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/__init__.py CHANGED
@@ -10,12 +10,17 @@ try:
 except version_reader.PackageNotFoundError:
     __version__ = "unknown"
 
-__all__ = [
-    "__version__",
-]
-
-import sibi_dst.df_helper as df_helper
+from sibi_dst.df_helper import *
 from sibi_dst.osmnx_helper import *
 from sibi_dst.geopy_helper import *
-from sibi_dst.utils import *
+from sibi_dst import utils as sibiutils
 
+
+__all__ = [
+    "__version__",
+    "DfHelper",
+    "ParquetArtifact",
+    "ParquetReader",
+    "ArtifactUpdaterMultiWrapperAsync",
+    "sibiutils"
+]
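With this change the package root, not star-imported `utils`, is the advertised import surface: the `df_helper` entry points are re-exported directly and `utils` is namespaced as `sibiutils`. A minimal consumer-side sketch (assuming the 2025.9.12 wheel is installed; names as declared in the new `__all__`):

```python
import sibi_dst

# Entry points promoted to the package root by the new __all__:
from sibi_dst import DfHelper, ParquetArtifact, ParquetReader
from sibi_dst import ArtifactUpdaterMultiWrapperAsync

# utils is no longer star-imported; it is reachable via the namespaced alias:
from sibi_dst import sibiutils

print(sibi_dst.__version__)
```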
sibi_dst/df_helper/__init__.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-#from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded
 
sibi_dst/df_helper/_artifact_updater_async.py CHANGED
@@ -4,28 +4,24 @@ import asyncio
 import datetime
 import random
 import time
-from contextlib import ExitStack
+import pickle
+from contextlib import ExitStack, suppress
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Sequence, Type
 
-from sibi_dst.utils import ManagedResource
-
-try:
-    from dask.distributed import Client, LocalCluster
-except ImportError:
-    Client = None
-    LocalCluster = None
+from sibi_dst.utils import ManagedResource, Logger
+from sibi_dst.utils.dask_utils import DaskClientMixin
 
 
 @dataclass(slots=True)
 class _RetryCfg:
+    """Retry and backoff configuration."""
     attempts: int = 3
     backoff_base: float = 2.0
     backoff_max: float = 60.0
     jitter: float = 0.15
 
 
-# ---------------- Worker (safe for Dask pickling) ----------------
 def run_artifact_update(
     cls: Type,
     artifact_class_kwargs: Dict[str, Any],
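`_RetryCfg` centralizes the backoff knobs shared by the Dask worker and the local async fallback. A standalone sketch of the schedule it drives, using the formula visible in the async path further down (exponential, capped, then scaled by a random jitter factor):

```python
import random

def backoff_delays(attempts: int = 3, base: float = 2.0,
                   cap: float = 60.0, jitter: float = 0.15):
    """Sketch of the retry schedule _RetryCfg drives, with its defaults."""
    for attempt in range(1, attempts + 1):
        delay = min(base ** attempt, cap)          # exponential growth, capped
        delay *= 1 + random.uniform(0, jitter)     # jitter de-synchronizes retries
        yield attempt, delay

for attempt, delay in backoff_delays():
    print(f"after failed attempt {attempt}: sleep ~{delay:.2f}s")
```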
@@ -33,21 +29,43 @@ def run_artifact_update(
     period: str,
     artifact_kwargs: Dict[str, Any],
 ) -> Dict[str, Any]:
-    """Standalone worker — safe for Dask distributed execution."""
+    """
+    Executed inside Dask worker.
+    Instantiates artifact and runs update_parquet() with retry logic.
+    Reconstructs logger and filesystem if not provided (worker isolation safe).
+    """
     import logging
+    import fsspec
+    from sibi_dst.utils import Logger
 
-    logger = logging.getLogger(cls.__name__)
+    # ---- Reinitialize a lightweight logger for the worker
+    worker_logger = Logger.default_logger(logger_name=cls.__name__) if hasattr(Logger, "default_logger") else logging.getLogger(cls.__name__)
+    worker_logger.set_level(logging.INFO)
+
+    # ---- Ensure fs is recreated if missing
+    fs = artifact_class_kwargs.get("fs")
+    if fs is None or isinstance(fs, str):
+        try:
+            fs_protocol = fs if isinstance(fs, str) else "file"
+            fs = fsspec.filesystem(fs_protocol)
+        except Exception:
+            fs = fsspec.filesystem("file")
+
+    # ---- Merge reconstructed environment into kwargs
+    artifact_kwargs_final = {
+        **artifact_class_kwargs,
+        "logger": worker_logger,
+        "fs": fs,
+    }
 
-    start_wall = datetime.datetime.now()
-    attempt_count = 0
-    success = False
-    error_msg = None
+    start_time = datetime.datetime.now()
+    success, error_msg, attempts = False, None, 0
 
     for attempt in range(1, retry.attempts + 1):
-        attempt_count = attempt
+        attempts = attempt
         try:
             with ExitStack() as stack:
-                inst = cls(**artifact_class_kwargs)
+                inst = cls(**artifact_kwargs_final)
                 inst = stack.enter_context(inst)
                 inst.update_parquet(period=period, **artifact_kwargs)
                 success = True
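The worker now rebuilds its logger and filesystem inside the Dask process instead of receiving them over the wire. One caveat worth flagging: the `logging.getLogger` fallback has no `set_level` method (the stdlib spelling is `setLevel`), so that branch would raise if `Logger.default_logger` were ever absent. Because `run_artifact_update` is a plain module-level function, it can be exercised without a cluster; a sketch with a hypothetical stub artifact (the module path is inferred from the `df_helper` imports above):

```python
import fsspec
from sibi_dst.df_helper._artifact_updater_async import run_artifact_update, _RetryCfg

class DemoArtifact:
    """Hypothetical stand-in: any context manager exposing update_parquet()."""
    def __init__(self, **kwargs):
        self.kwargs = kwargs
    def __enter__(self):
        return self
    def __exit__(self, *exc):
        return False
    def update_parquet(self, period: str, **kwargs) -> None:
        print(f"updating '{period}' artifact")

result = run_artifact_update(
    DemoArtifact,
    {"fs": fsspec.filesystem("file"), "debug": False},
    _RetryCfg(attempts=2),
    "daily",     # period
    {},          # artifact_kwargs
)
print(result["success"], result["attempts"], result["duration_seconds"])
```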
@@ -59,31 +77,40 @@ def run_artifact_update(
             delay *= 1 + random.uniform(0, retry.jitter)
             time.sleep(delay)
 
-    end_wall = datetime.datetime.now()
-    duration = (end_wall - start_wall).total_seconds()
+    duration = (datetime.datetime.now() - start_time).total_seconds()
+    status = "😀" if success else "😩"
+    worker_logger.info(
+        f"{status} {cls.__name__} [{period}] finished in {duration:.2f}s ({attempts} attempt(s))"
+    )
 
     return {
         "artifact": cls.__name__,
         "period": period,
-        "start": start_wall.isoformat(),
-        "end": end_wall.isoformat(),
-        "processing_time": duration,
-        "retries": attempt_count - 1 if success else attempt_count,
         "success": success,
         "error": error_msg,
+        "attempts": attempts,
+        "duration_seconds": duration,
+        "started_at": start_time.isoformat(),
+        "ended_at": datetime.datetime.now().isoformat(),
     }
 
 
-class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
+# ---------------- Async Orchestrator ----------------
+class ArtifactUpdaterMultiWrapperAsync(DaskClientMixin, ManagedResource):
     """
-    Async/Threaded orchestrator.
-    Dask-enabled if a Client is passed (or created automatically).
+    Async orchestrator for concurrent artifact updates.
+
+    • Uses Dask client (via DaskClientMixin) or local threads.
+    • Automatically sanitizes non-picklable arguments (e.g., loggers, fs).
+    • Provides structured retries, async orchestration, and safe cleanup.
     """
 
     def __init__(
         self,
         wrapped_classes: Dict[str, Sequence[Type]],
         *,
+        logger: Logger,
+        fs,
         max_workers: int = 3,
         retry_attempts: int = 3,
         update_timeout_seconds: int = 600,
@@ -92,88 +119,127 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         backoff_jitter: float = 0.15,
         priority_fn: Optional[Callable[[Type], int]] = None,
         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-        dask_client: Optional[Client] = None,
         use_dask: bool = True,
+        dask_client: Optional[Any] = None,
+        debug: bool = False,
+        verbose: bool = False,
         **kwargs: Any,
     ) -> None:
-        super().__init__(**kwargs)
+        super().__init__(logger=logger, fs=fs, debug=debug, verbose=verbose)
+
+        # ---- Client lifecycle management
+
+        self.own_dask_client = dask_client is None
+        self._init_dask_client(dask_client, logger=logger)
+        self.use_dask = use_dask
 
+        # ---- Core configuration
         self.wrapped_classes = wrapped_classes
-        self.max_workers = int(max_workers)
-        self.update_timeout_seconds = int(update_timeout_seconds)
+        self.max_workers = max_workers
         self.priority_fn = priority_fn
-        self.use_dask = use_dask
-        self.client: Optional[Client] = dask_client
-        self._owns_client = False
+        self.update_timeout_seconds = update_timeout_seconds
 
+        # ---- Retry configuration
         self._retry = _RetryCfg(
-            attempts=int(retry_attempts),
-            backoff_base=float(backoff_base),
-            backoff_max=float(backoff_max),
-            jitter=float(backoff_jitter),
+            attempts=retry_attempts,
+            backoff_base=backoff_base,
+            backoff_max=backoff_max,
+            jitter=backoff_jitter,
        )
 
-        # Safe kwargs for artifacts
-        if self.use_dask:
-            self.artifact_class_kwargs = {
-                "debug": self.debug,
-                "verbose": self.verbose,
-                **(artifact_class_kwargs or {}),
-            }
-        else:
-            self.artifact_class_kwargs = {
-                "logger": self.logger,
-                "fs": self.fs,
-                "debug": self.debug,
-                "verbose": self.verbose,
-                **(artifact_class_kwargs or {}),
-            }
+        # ---- Artifact instantiation arguments
+        self.artifact_class_kwargs = {
+            "logger": logger,
+            "fs": fs,
+            "debug": debug,
+            "verbose": verbose,
+            **(artifact_class_kwargs or {}),
+        }
 
+        # ---- Runtime tracking
         self.completion_secs: Dict[str, float] = {}
         self.failed: List[str] = []
-        self._stop = asyncio.Event()
-
-        if self.use_dask and Client is None:
-            raise RuntimeError("Dask is not installed, cannot use Dask mode")
-
-        # auto-start local client if requested
-        if self.use_dask and not self.client:
-            self.client = Client(
-                LocalCluster(
-                    n_workers=max_workers,
-                    threads_per_worker=1,
-                    dashboard_address=None,
-                )
-            )
-            self._owns_client = True
+        self._stop_event = asyncio.Event()
 
-    # ---- Internals ------------------------------------------------------------
+        self.logger_extra = {"sibi_dst_component": self.__class__.__name__}
+
+        if self.use_dask:
+            self.logger.debug(f"Initialized with Dask client: {self.dask_client}")
+        else:
+            self.logger.debug(f"Running in local thread-based mode.")
+
+    async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
+        """Runs updates for all artifacts in a given period."""
+        self.completion_secs.clear()
+        self.failed.clear()
+        classes = self._classes_for(period)
+
+        self.logger.info(
+            f"Starting artifact updates for period '{period}' ({len(classes)} artifacts).",
+            extra=self.logger_extra,
+        )
 
-    def _classes_for(self, period: str) -> List[Type]:
         try:
-            classes = list(self.wrapped_classes[period])
-        except KeyError:
-            raise ValueError(f"Unsupported period '{period}'.")
-        if not classes:
-            raise ValueError(f"No artifact classes configured for '{period}'.")
-        if self.priority_fn:
+            if self.use_dask:
+                futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
+                results = await asyncio.to_thread(lambda: self.dask_client.gather(futures))
+            else:
+                sem = asyncio.Semaphore(self.max_workers)
+                tasks = [self._run_one_async(cls, period, sem, kwargs) for cls in classes]
+                results = await asyncio.gather(*tasks)
+
+            self.logger.info(
+                f"Completed {len(results)} artifact updates for period '{period}'.",
+                extra=self.logger_extra,
+            )
+            return results
+
+        finally:
+            # Always cleanup if we own the client
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
+
+
+    def _sanitize_kwargs_for_dask(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Removes non-picklable runtime objects (e.g., loggers, fs) before sending to Dask.
+        """
+        clean: Dict[str, Any] = {}
+        for k, v in kwargs.items():
             try:
-                classes.sort(key=self.priority_fn)
-            except Exception as e:
-                self.logger.warning(f"priority_fn failed; using listed order: {e}")
-        return classes
+                pickle.dumps(v)
+                clean[k] = v
+            except Exception:
+                self.logger.debug(f"Skipping non-picklable key '{k}' for Dask worker.")
+        return clean
 
     def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
-        return self.client.submit(
+        """Submit one artifact job to Dask."""
+        safe_kwargs = self._sanitize_kwargs_for_dask(self.artifact_class_kwargs)
+        return self.dask_client.submit(
             run_artifact_update,
             cls,
-            dict(self.artifact_class_kwargs),
+            safe_kwargs,
             self._retry,
             period,
             artifact_kwargs,
             pure=False,
         )
 
+    def _classes_for(self, period: str) -> List[Type]:
+        """Selects artifact classes for the given period."""
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"No artifacts configured for period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes found for '{period}'.")
+
+        if self.priority_fn:
+            with suppress(Exception):
+                classes.sort(key=self.priority_fn)
+        return classes
+
     async def _run_one_async(
         self,
         cls: Type,
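`update_data` now owns the whole submit/gather/cleanup cycle, and pickling problems are handled up front by probing each constructor argument with `pickle.dumps`. Note that the `finally` block closes an owned client after the first call, so a wrapper that created its own client is effectively single-use. A hypothetical end-to-end wiring in local thread mode, reusing the `DemoArtifact` stub from the worker sketch above (`Logger.default_logger` is assumed to exist, mirroring the worker's own usage):

```python
import asyncio
import fsspec
from sibi_dst import ArtifactUpdaterMultiWrapperAsync
from sibi_dst.utils import Logger

wrapper = ArtifactUpdaterMultiWrapperAsync(
    wrapped_classes={"daily": [DemoArtifact]},             # stub from the worker sketch
    logger=Logger.default_logger(logger_name="updater"),   # assumed helper
    fs=fsspec.filesystem("file"),
    use_dask=False,       # local asyncio.Semaphore path; no cluster involved
    max_workers=2,
    retry_attempts=2,
)

results = asyncio.run(wrapper.update_data("daily"))
for r in results:
    print(r["artifact"], r["success"], r.get("duration_seconds"))
```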
@@ -181,112 +247,70 @@
         sem: asyncio.Semaphore,
         artifact_kwargs: Dict[str, Any],
     ) -> Dict[str, Any]:
-        """Async/threaded fallback execution."""
+        """Fallback local async execution (no Dask)."""
         name = cls.__name__
-        self.logger.info(f"▶️ Starting {name} for period '{period}'")
-        start_wall = datetime.datetime.now()
-
-        attempt_count = 0
-        success = False
-        error_msg = None
-
-        try:
-            async with sem:
-                for attempt in range(1, self._retry.attempts + 1):
-                    attempt_count = attempt
-                    try:
-                        def _sync_block():
-                            with ExitStack() as stack:
-                                inst = cls(**self.artifact_class_kwargs)
-                                inst = stack.enter_context(inst)
-                                inst.update_parquet(period=period, **artifact_kwargs)
-
-                        await asyncio.wait_for(
-                            asyncio.to_thread(_sync_block),
-                            timeout=self.update_timeout_seconds,
-                        )
-                        success = True
-                        break
-                    except Exception as e:
-                        error_msg = str(e)
-                        if attempt < self._retry.attempts and not self._stop.is_set():
-                            delay = min(
-                                self._retry.backoff_base ** (attempt - 1),
-                                self._retry.backoff_max,
-                            )
-                            delay *= 1 + random.uniform(0, self._retry.jitter)
-                            await asyncio.sleep(delay)
-        finally:
-            end_wall = datetime.datetime.now()
-            duration = (end_wall - start_wall).total_seconds()
-
-            result = {
-                "artifact": name,
-                "period": period,
-                "start": start_wall.isoformat(),
-                "end": end_wall.isoformat(),
-                "processing_time": duration,
-                "retries": attempt_count - 1 if success else attempt_count,
-                "success": success,
-                "error": error_msg,
-            }
-
-            if success:
-                self.logger.info(f"✅ Artifact {name} succeeded", extra=result)
-                self.completion_secs[name] = duration
-            else:
-                self.logger.error(f"❌ Artifact {name} failed", extra=result)
-                self.failed.append(name)
-
-            return result
+        start_time = datetime.datetime.now()
+
+        async with sem:
+            for attempt in range(1, self._retry.attempts + 1):
+                try:
+                    def _sync_block():
+                        with ExitStack() as stack:
+                            inst = cls(**self.artifact_class_kwargs)
+                            inst = stack.enter_context(inst)
+                            inst.update_parquet(period=period, **artifact_kwargs)
+
+                    await asyncio.wait_for(
+                        asyncio.to_thread(_sync_block),
+                        timeout=self.update_timeout_seconds,
+                    )
+                    duration = (datetime.datetime.now() - start_time).total_seconds()
+                    self.completion_secs[name] = duration
+                    self.logger.info(f"✅ {name} completed in {duration:.2f}s")
+                    return {
+                        "artifact": name,
+                        "period": period,
+                        "success": True,
+                        "attempts": attempt,
+                        "duration_seconds": duration,
+                    }
+
+                except Exception as e:
+                    if attempt < self._retry.attempts:
+                        delay = min(self._retry.backoff_base ** attempt, self._retry.backoff_max)
+                        delay *= 1 + random.uniform(0, self._retry.jitter)
+                        self.logger.warning(f"Retry {attempt}/{self._retry.attempts} for {name}: {e}")
+                        await asyncio.sleep(delay)
+                    else:
+                        duration = (datetime.datetime.now() - start_time).total_seconds()
+                        self.failed.append(name)
+                        self.logger.error(f"❌ {name} failed after {attempt} attempts: {e}")
+                        return {
+                            "artifact": name,
+                            "period": period,
+                            "success": False,
+                            "attempts": attempt,
+                            "error": str(e),
+                            "duration_seconds": duration,
+                        }
 
-    # ---- Public API -----------------------------------------------------------
-
-    async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
-        self.completion_secs.clear()
-        self.failed.clear()
-        classes = self._classes_for(period)
-
-        try:
-            if self.use_dask:
-                futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
-                results = await asyncio.to_thread(lambda: self.client.gather(futures))
-            else:
-                sem = asyncio.Semaphore(self.max_workers)
-                tasks = [
-                    asyncio.create_task(self._run_one_async(cls, period, sem, kwargs))
-                    for cls in classes
-                ]
-                results = await asyncio.gather(*tasks)
-            return results
-        finally:
-            # only shut down if we own the client
-            if self._owns_client:
-                self.close()
 
     def get_update_status(self) -> Dict[str, Any]:
+        """Returns summary of completed, failed, and pending artifacts."""
         done = set(self.completion_secs)
         fail = set(self.failed)
-        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
+        all_names = {cls.__name__ for v in self.wrapped_classes.values() for cls in v}
         return {
             "total": len(all_names),
             "completed": sorted(done),
             "failed": sorted(fail),
             "pending": sorted(all_names - done - fail),
-            "completion_times": dict(self.completion_secs),
+            "completion_times": self.completion_secs,
         }
 
-    # ---- Lifecycle ------------------------------------------------------------
-
     def _cleanup(self) -> None:
-        """Release any resources created by this wrapper."""
-        if self._owns_client and self.client is not None:
-            try:
-                cluster = getattr(self.client, "cluster", None)
-                self.client.close()
-                if cluster is not None:
-                    cluster.close()
-            finally:
-                self.client = None
-                self._owns_client = False
+        """Ensures safe resource closure."""
+        with suppress(Exception):
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
 
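`get_update_status` keeps its shape, but `completion_times` now returns the live `completion_secs` dict rather than a copy, so callers share mutable state with the wrapper. A small sketch against the wrapper from the previous example:

```python
status = wrapper.get_update_status()
# Expected shape:
# {"total": 1, "completed": ["DemoArtifact"], "failed": [],
#  "pending": [], "completion_times": {"DemoArtifact": 0.01}}
print(status["pending"])

# Caution: this is now the wrapper's own dict, not a copy.
# status["completion_times"].clear()  # would erase the wrapper's timings
```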
@@ -231,7 +231,7 @@ class ParquetConfig(BaseModel):
         Builds a list of path patterns for dask.read_parquet.
         Respects partition_on + start/end date if given.
         """
-        print(f"_resolve_paths_for_read: {self.partition_on}")
+        self.logger.debug(f"_resolve_paths_for_read: {self.partition_on}")
         # Partitioned dataset by column
         if self.partition_on and self.parquet_start_date and self.parquet_end_date:
             if not isinstance(self.partition_on, (list, tuple)):
@@ -244,12 +244,10 @@ class ParquetConfig(BaseModel):
             days = pd.date_range(start=start, end=end, freq="D").date
 
             base = self.parquet_storage_path.rstrip("/")
-            print("base:",base)
             result= [
                 f"{base}/{parts[0]}={d.isoformat()}/*.parquet"
                 for d in days
             ]
-            print("result:",result)
             return result
 
         # Date-ranged folders (non-partitioned, using FilePathGenerator)
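With the debug prints replaced by `logger.debug`, the path-building logic is unchanged: one hive-style glob per day. A standalone sketch of what `_resolve_paths_for_read` produces for a column-partitioned dataset (bucket and column names are illustrative):

```python
import pandas as pd

base = "s3://bucket/dataset"          # illustrative storage root
partition_col = "partition_date"      # illustrative partition column
days = pd.date_range("2025-09-10", "2025-09-12", freq="D").date

# Mirrors the comprehension in the hunk above: one glob per day.
paths = [f"{base}/{partition_col}={d.isoformat()}/*.parquet" for d in days]
# ['s3://bucket/dataset/partition_date=2025-09-10/*.parquet', ...]
```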
sibi_dst/osmnx_helper/__init__.py CHANGED
@@ -1,7 +1,9 @@
 from .base_osm_map import BaseOsmMap
 from .utils import PBFHandler
+from .route_path_builder import RoutePathBuilder, RoutePathBuilderConfig
 __all__ = [
     "BaseOsmMap",
-    #"MapConfig",
+    "RoutePathBuilder",
+    "RoutePathBuilderConfig",
     "PBFHandler",
 ]
sibi_dst/utils/__init__.py CHANGED
@@ -24,6 +24,7 @@ from .manifest_manager import MissingManifestManager
 __all__ = [
     "Logger",
     "ManagedResource",
+
     "ConfigManager",
     "ConfigLoader",
     "DateUtils",
@@ -42,5 +43,5 @@ __all__ = [
     "FsRegistry",
     "DataFromHttpSource",
     "WebDAVClient",
-    "MissingManifestManager"
+    "MissingManifestManager",
 ]
@@ -93,7 +93,7 @@ class BasePipeline(ManagedResource):
         df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
         df["partition_date"] = df[self.date_field].dt.date.astype(str)
 
-        out_path = self.storage_path.rstrip("/")+"/"+self._get_output_filename(fmt="parquet")
+        out_path = self.storage_path.rstrip("/")
         self.logger.info("Saving dataset to %s", out_path)
         ps = ParquetSaver(
             df_result=df,
@@ -111,7 +111,6 @@ class BasePipeline(ManagedResource):
             parquet_start_date=self.start_date,
             parquet_end_date=self.end_date,
             parquet_storage_path=self.storage_path,
-            parquet_filename=self._get_output_filename(),
             fs=self.fs,
             debug=self.debug,
             logger=self.logger,
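Dropping `parquet_filename` from both the save and read configuration is consistent with the partitioned layout: `ParquetSaver` now receives the dataset root, and readers glob the `partition_date=<day>/` subdirectories built above. A generic dask sketch of that layout (not `ParquetSaver`'s internals, which this diff does not show):

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"value": [1, 2], "partition_date": ["2025-09-10", "2025-09-11"]})
df = dd.from_pandas(pdf, npartitions=1)

# partition_on produces partition_date=<day>/ subdirectories under the
# root path -- the same layout the glob patterns above read back.
df.to_parquet("dataset_root", partition_on=["partition_date"], write_index=False)
```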