sibi-dst 2025.9.11__py3-none-any.whl → 2025.9.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/__init__.py CHANGED
@@ -10,12 +10,17 @@ try:
  except version_reader.PackageNotFoundError:
      __version__ = "unknown"
 
- __all__ = [
-     "__version__",
- ]
-
- import sibi_dst.df_helper as df_helper
+ from sibi_dst.df_helper import *
  from sibi_dst.osmnx_helper import *
  from sibi_dst.geopy_helper import *
- from sibi_dst.utils import *
+ from sibi_dst import utils as sibiutils
 
+
+ __all__ = [
+     "__version__",
+     "DfHelper",
+     "ParquetArtifact",
+     "ParquetReader",
+     "ArtifactUpdaterMultiWrapperAsync",
+     "sibiutils"
+ ]
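Net effect of this hunk: a flatter public API. The helper classes are re-exported from the package root via `from sibi_dst.df_helper import *`, and `utils` becomes an explicit `sibiutils` namespace instead of a star import. A minimal sketch of the new import surface (assuming `df_helper` exports the names listed in `__all__`, as the next hunk suggests):

```python
# Sketch of the 2025.9.12 import surface; nothing here is executed by the
# package itself, it just demonstrates what __all__ now promises.
import sibi_dst

print(sibi_dst.__version__)

# Helper classes are now importable from the package root:
from sibi_dst import DfHelper, ParquetArtifact, ParquetReader

# utils is namespaced rather than star-imported:
from sibi_dst import sibiutils
```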
sibi_dst/df_helper/__init__.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
  from ._df_helper import DfHelper
  from ._parquet_artifact import ParquetArtifact
  from ._parquet_reader import ParquetReader
- #from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
  from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
  from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded
 
sibi_dst/df_helper/_artifact_updater_async.py CHANGED
@@ -4,28 +4,24 @@ import asyncio
  import datetime
  import random
  import time
- from contextlib import ExitStack
+ import pickle
+ from contextlib import ExitStack, suppress
  from dataclasses import dataclass
  from typing import Any, Callable, Dict, List, Optional, Sequence, Type
 
- from sibi_dst.utils import ManagedResource
-
- try:
-     from dask.distributed import Client, LocalCluster
- except ImportError:
-     Client = None
-     LocalCluster = None
+ from sibi_dst.utils import ManagedResource, Logger
+ from sibi_dst.utils.dask_utils import DaskClientMixin
 
 
  @dataclass(slots=True)
  class _RetryCfg:
+     """Retry and backoff configuration."""
      attempts: int = 3
      backoff_base: float = 2.0
      backoff_max: float = 60.0
      jitter: float = 0.15
 
 
- # ---------------- Worker (safe for Dask pickling) ----------------
  def run_artifact_update(
      cls: Type,
      artifact_class_kwargs: Dict[str, Any],
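Review note on `_RetryCfg`: the worker loop later in this file sleeps `min(backoff_base ** (attempt - 1), backoff_max)` seconds, inflated by up to `jitter` of random headroom, before the next attempt (the local async path uses `backoff_base ** attempt` instead). A standalone sketch of the default schedule:

```python
# Worked example of the backoff schedule encoded by _RetryCfg's defaults
# (attempts=3, backoff_base=2.0, backoff_max=60.0, jitter=0.15).
import random

attempts, base, cap, jitter = 3, 2.0, 60.0, 0.15

for attempt in range(1, attempts):  # no sleep after the final attempt
    delay = min(base ** (attempt - 1), cap)   # 1.0s, then 2.0s
    delay *= 1 + random.uniform(0, jitter)    # up to +15% jitter
    print(f"after failed attempt {attempt}: sleep {delay:.2f}s")
```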
@@ -33,21 +29,43 @@ def run_artifact_update(
      period: str,
      artifact_kwargs: Dict[str, Any],
  ) -> Dict[str, Any]:
-     """Standalone worker — safe for Dask distributed execution."""
+     """
+     Executed inside Dask worker.
+     Instantiates artifact and runs update_parquet() with retry logic.
+     Reconstructs logger and filesystem if not provided (worker isolation safe).
+     """
      import logging
+     import fsspec
+     from sibi_dst.utils import Logger
 
-     logger = logging.getLogger(cls.__name__)
+     # ---- Reinitialize a lightweight logger for the worker
+     worker_logger = Logger.default_logger(logger_name=cls.__name__) if hasattr(Logger, "default_logger") else logging.getLogger(cls.__name__)
+     worker_logger.set_level(logging.INFO)
+
+     # ---- Ensure fs is recreated if missing
+     fs = artifact_class_kwargs.get("fs")
+     if fs is None or isinstance(fs, str):
+         try:
+             fs_protocol = fs if isinstance(fs, str) else "file"
+             fs = fsspec.filesystem(fs_protocol)
+         except Exception:
+             fs = fsspec.filesystem("file")
+
+     # ---- Merge reconstructed environment into kwargs
+     artifact_kwargs_final = {
+         **artifact_class_kwargs,
+         "logger": worker_logger,
+         "fs": fs,
+     }
 
-     start_wall = datetime.datetime.now()
-     attempt_count = 0
-     success = False
-     error_msg = None
+     start_time = datetime.datetime.now()
+     success, error_msg, attempts = False, None, 0
 
      for attempt in range(1, retry.attempts + 1):
-         attempt_count = attempt
+         attempts = attempt
          try:
              with ExitStack() as stack:
-                 inst = cls(**artifact_class_kwargs)
+                 inst = cls(**artifact_kwargs_final)
                  inst = stack.enter_context(inst)
                  inst.update_parquet(period=period, **artifact_kwargs)
                  success = True
@@ -59,31 +77,40 @@ def run_artifact_update(
              delay *= 1 + random.uniform(0, retry.jitter)
              time.sleep(delay)
 
-     end_wall = datetime.datetime.now()
-     duration = (end_wall - start_wall).total_seconds()
+     duration = (datetime.datetime.now() - start_time).total_seconds()
+     status = "😀" if success else "😩"
+     worker_logger.info(
+         f"{status} {cls.__name__} [{period}] finished in {duration:.2f}s ({attempts} attempt(s))"
+     )
 
      return {
          "artifact": cls.__name__,
          "period": period,
-         "start": start_wall.isoformat(),
-         "end": end_wall.isoformat(),
-         "processing_time": duration,
-         "retries": attempt_count - 1 if success else attempt_count,
          "success": success,
          "error": error_msg,
+         "attempts": attempts,
+         "duration_seconds": duration,
+         "started_at": start_time.isoformat(),
+         "ended_at": datetime.datetime.now().isoformat(),
      }
 
 
- class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
+ # ---------------- Async Orchestrator ----------------
+ class ArtifactUpdaterMultiWrapperAsync(DaskClientMixin, ManagedResource):
      """
-     Async/Threaded orchestrator.
-     Dask-enabled if a Client is passed (or created automatically).
+     Async orchestrator for concurrent artifact updates.
+
+     • Uses Dask client (via DaskClientMixin) or local threads.
+     • Automatically sanitizes non-picklable arguments (e.g., loggers, fs).
+     • Provides structured retries, async orchestration, and safe cleanup.
      """
 
      def __init__(
          self,
          wrapped_classes: Dict[str, Sequence[Type]],
          *,
+         logger: Logger,
+         fs,
          max_workers: int = 3,
          retry_attempts: int = 3,
          update_timeout_seconds: int = 600,
@@ -92,88 +119,127 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
          backoff_jitter: float = 0.15,
          priority_fn: Optional[Callable[[Type], int]] = None,
          artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-         dask_client: Optional[Client] = None,
          use_dask: bool = True,
+         dask_client: Optional[Any] = None,
+         debug: bool = False,
+         verbose: bool = False,
          **kwargs: Any,
      ) -> None:
-         super().__init__(**kwargs)
+         super().__init__(logger=logger, fs=fs, debug=debug, verbose=verbose)
+
+         # ---- Client lifecycle management
+
+         self.own_dask_client = dask_client is None
+         self._init_dask_client(dask_client, logger=logger)
+         self.use_dask = use_dask
 
+         # ---- Core configuration
          self.wrapped_classes = wrapped_classes
-         self.max_workers = int(max_workers)
-         self.update_timeout_seconds = int(update_timeout_seconds)
+         self.max_workers = max_workers
          self.priority_fn = priority_fn
-         self.use_dask = use_dask
-         self.client: Optional[Client] = dask_client
-         self._owns_client = False
+         self.update_timeout_seconds = update_timeout_seconds
 
+         # ---- Retry configuration
          self._retry = _RetryCfg(
-             attempts=int(retry_attempts),
-             backoff_base=float(backoff_base),
-             backoff_max=float(backoff_max),
-             jitter=float(backoff_jitter),
+             attempts=retry_attempts,
+             backoff_base=backoff_base,
+             backoff_max=backoff_max,
+             jitter=backoff_jitter,
          )
 
-         # Safe kwargs for artifacts
-         if self.use_dask:
-             self.artifact_class_kwargs = {
-                 "debug": self.debug,
-                 "verbose": self.verbose,
-                 **(artifact_class_kwargs or {}),
-             }
-         else:
-             self.artifact_class_kwargs = {
-                 "logger": self.logger,
-                 "fs": self.fs,
-                 "debug": self.debug,
-                 "verbose": self.verbose,
-                 **(artifact_class_kwargs or {}),
-             }
+         # ---- Artifact instantiation arguments
+         self.artifact_class_kwargs = {
+             "logger": logger,
+             "fs": fs,
+             "debug": debug,
+             "verbose": verbose,
+             **(artifact_class_kwargs or {}),
+         }
 
+         # ---- Runtime tracking
          self.completion_secs: Dict[str, float] = {}
          self.failed: List[str] = []
-         self._stop = asyncio.Event()
-
-         if self.use_dask and Client is None:
-             raise RuntimeError("Dask is not installed, cannot use Dask mode")
-
-         # auto-start local client if requested
-         if self.use_dask and not self.client:
-             self.client = Client(
-                 LocalCluster(
-                     n_workers=max_workers,
-                     threads_per_worker=1,
-                     dashboard_address=None,
-                 )
-             )
-             self._owns_client = True
+         self._stop_event = asyncio.Event()
 
-     # ---- Internals ------------------------------------------------------------
+         self.logger_extra = {"sibi_dst_component": self.__class__.__name__}
+
+         if self.use_dask:
+             self.logger.debug(f"Initialized with Dask client: {self.dask_client}")
+         else:
+             self.logger.debug(f"Running in local thread-based mode.")
+
+     async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
+         """Runs updates for all artifacts in a given period."""
+         self.completion_secs.clear()
+         self.failed.clear()
+         classes = self._classes_for(period)
+
+         self.logger.info(
+             f"Starting artifact updates for period '{period}' ({len(classes)} artifacts).",
+             extra=self.logger_extra,
+         )
 
-     def _classes_for(self, period: str) -> List[Type]:
          try:
-             classes = list(self.wrapped_classes[period])
-         except KeyError:
-             raise ValueError(f"Unsupported period '{period}'.")
-         if not classes:
-             raise ValueError(f"No artifact classes configured for '{period}'.")
-         if self.priority_fn:
+             if self.use_dask:
+                 futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
+                 results = await asyncio.to_thread(lambda: self.dask_client.gather(futures))
+             else:
+                 sem = asyncio.Semaphore(self.max_workers)
+                 tasks = [self._run_one_async(cls, period, sem, kwargs) for cls in classes]
+                 results = await asyncio.gather(*tasks)
+
+             self.logger.info(
+                 f"Completed {len(results)} artifact updates for period '{period}'.",
+                 extra=self.logger_extra,
+             )
+             return results
+
+         finally:
+             # Always cleanup if we own the client
+             if getattr(self, "own_dask_client", False):
+                 self._close_dask_client()
+
+     def _sanitize_kwargs_for_dask(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Removes non-picklable runtime objects (e.g., loggers, fs) before sending to Dask.
+         """
+         clean: Dict[str, Any] = {}
+         for k, v in kwargs.items():
              try:
-                 classes.sort(key=self.priority_fn)
-             except Exception as e:
-                 self.logger.warning(f"priority_fn failed; using listed order: {e}")
-         return classes
+                 pickle.dumps(v)
+                 clean[k] = v
+             except Exception:
+                 self.logger.debug(f"Skipping non-picklable key '{k}' for Dask worker.")
+         return clean
 
      def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
-         return self.client.submit(
+         """Submit one artifact job to Dask."""
+         safe_kwargs = self._sanitize_kwargs_for_dask(self.artifact_class_kwargs)
+         return self.dask_client.submit(
              run_artifact_update,
              cls,
-             dict(self.artifact_class_kwargs),
+             safe_kwargs,
              self._retry,
              period,
              artifact_kwargs,
              pure=False,
          )
 
+     def _classes_for(self, period: str) -> List[Type]:
+         """Selects artifact classes for the given period."""
+         try:
+             classes = list(self.wrapped_classes[period])
+         except KeyError:
+             raise ValueError(f"No artifacts configured for period '{period}'.")
+         if not classes:
+             raise ValueError(f"No artifact classes found for '{period}'.")
+
+         if self.priority_fn:
+             with suppress(Exception):
+                 classes.sort(key=self.priority_fn)
+         return classes
+
      async def _run_one_async(
          self,
          cls: Type,
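Review note on `_sanitize_kwargs_for_dask`: each kwarg is probed with `pickle.dumps` and dropped if the probe fails, which is safe because `run_artifact_update` rebuilds its own logger and filesystem on the worker. The same probe in isolation (`threading.Lock` standing in for any non-picklable runtime handle):

```python
# Standalone demonstration of the pickle-probe filtering idea.
import pickle
import threading

kwargs = {
    "debug": True,                 # picklable -> kept
    "lock": threading.Lock(),      # not picklable -> dropped
}

clean = {}
for key, value in kwargs.items():
    try:
        pickle.dumps(value)
        clean[key] = value
    except Exception:
        print(f"dropping non-picklable kwarg: {key}")

print(sorted(clean))  # ['debug']
```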
@@ -181,112 +247,70 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
          sem: asyncio.Semaphore,
          artifact_kwargs: Dict[str, Any],
      ) -> Dict[str, Any]:
-         """Async/threaded fallback execution."""
+         """Fallback local async execution (no Dask)."""
          name = cls.__name__
-         self.logger.info(f"▶️ Starting {name} for period '{period}'")
-         start_wall = datetime.datetime.now()
-
-         attempt_count = 0
-         success = False
-         error_msg = None
-
-         try:
-             async with sem:
-                 for attempt in range(1, self._retry.attempts + 1):
-                     attempt_count = attempt
-                     try:
-                         def _sync_block():
-                             with ExitStack() as stack:
-                                 inst = cls(**self.artifact_class_kwargs)
-                                 inst = stack.enter_context(inst)
-                                 inst.update_parquet(period=period, **artifact_kwargs)
-
-                         await asyncio.wait_for(
-                             asyncio.to_thread(_sync_block),
-                             timeout=self.update_timeout_seconds,
-                         )
-                         success = True
-                         break
-                     except Exception as e:
-                         error_msg = str(e)
-                         if attempt < self._retry.attempts and not self._stop.is_set():
-                             delay = min(
-                                 self._retry.backoff_base ** (attempt - 1),
-                                 self._retry.backoff_max,
-                             )
-                             delay *= 1 + random.uniform(0, self._retry.jitter)
-                             await asyncio.sleep(delay)
-         finally:
-             end_wall = datetime.datetime.now()
-             duration = (end_wall - start_wall).total_seconds()
-
-             result = {
-                 "artifact": name,
-                 "period": period,
-                 "start": start_wall.isoformat(),
-                 "end": end_wall.isoformat(),
-                 "processing_time": duration,
-                 "retries": attempt_count - 1 if success else attempt_count,
-                 "success": success,
-                 "error": error_msg,
-             }
-
-             if success:
-                 self.logger.info(f"✅ Artifact {name} succeeded", extra=result)
-                 self.completion_secs[name] = duration
-             else:
-                 self.logger.error(f"❌ Artifact {name} failed", extra=result)
-                 self.failed.append(name)
-
-             return result
+         start_time = datetime.datetime.now()
+
+         async with sem:
+             for attempt in range(1, self._retry.attempts + 1):
+                 try:
+                     def _sync_block():
+                         with ExitStack() as stack:
+                             inst = cls(**self.artifact_class_kwargs)
+                             inst = stack.enter_context(inst)
+                             inst.update_parquet(period=period, **artifact_kwargs)
+
+                     await asyncio.wait_for(
+                         asyncio.to_thread(_sync_block),
+                         timeout=self.update_timeout_seconds,
+                     )
+                     duration = (datetime.datetime.now() - start_time).total_seconds()
+                     self.completion_secs[name] = duration
+                     self.logger.info(f"✅ {name} completed in {duration:.2f}s")
+                     return {
+                         "artifact": name,
+                         "period": period,
+                         "success": True,
+                         "attempts": attempt,
+                         "duration_seconds": duration,
+                     }
+
+                 except Exception as e:
+                     if attempt < self._retry.attempts:
+                         delay = min(self._retry.backoff_base ** attempt, self._retry.backoff_max)
+                         delay *= 1 + random.uniform(0, self._retry.jitter)
+                         self.logger.warning(f"Retry {attempt}/{self._retry.attempts} for {name}: {e}")
+                         await asyncio.sleep(delay)
+                     else:
+                         duration = (datetime.datetime.now() - start_time).total_seconds()
+                         self.failed.append(name)
+                         self.logger.error(f"❌ {name} failed after {attempt} attempts: {e}")
+                         return {
+                             "artifact": name,
+                             "period": period,
+                             "success": False,
+                             "attempts": attempt,
+                             "error": str(e),
+                             "duration_seconds": duration,
+                         }
 
-     # ---- Public API -----------------------------------------------------------
-
-     async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
-         self.completion_secs.clear()
-         self.failed.clear()
-         classes = self._classes_for(period)
-
-         try:
-             if self.use_dask:
-                 futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
-                 results = await asyncio.to_thread(lambda: self.client.gather(futures))
-             else:
-                 sem = asyncio.Semaphore(self.max_workers)
-                 tasks = [
-                     asyncio.create_task(self._run_one_async(cls, period, sem, kwargs))
-                     for cls in classes
-                 ]
-                 results = await asyncio.gather(*tasks)
-             return results
-         finally:
-             # only shut down if we own the client
-             if self._owns_client:
-                 self.close()
 
      def get_update_status(self) -> Dict[str, Any]:
+         """Returns summary of completed, failed, and pending artifacts."""
          done = set(self.completion_secs)
          fail = set(self.failed)
-         all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
+         all_names = {cls.__name__ for v in self.wrapped_classes.values() for cls in v}
          return {
              "total": len(all_names),
              "completed": sorted(done),
              "failed": sorted(fail),
              "pending": sorted(all_names - done - fail),
-             "completion_times": dict(self.completion_secs),
+             "completion_times": self.completion_secs,
          }
 
-     # ---- Lifecycle ------------------------------------------------------------
-
      def _cleanup(self) -> None:
-         """Release any resources created by this wrapper."""
-         if self._owns_client and self.client is not None:
-             try:
-                 cluster = getattr(self.client, "cluster", None)
-                 self.client.close()
-                 if cluster is not None:
-                     cluster.close()
-             finally:
-                 self.client = None
-                 self._owns_client = False
+         """Ensures safe resource closure."""
+         with suppress(Exception):
+             if getattr(self, "own_dask_client", False):
+                 self._close_dask_client()
 
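End-to-end, the rewritten class now takes `logger` and `fs` as required keyword-only arguments and delegates client lifecycle to `DaskClientMixin`. A hypothetical driver, where `ToyArtifact` is a stand-in for any class meeting the context-manager plus `update_parquet(period=...)` contract the orchestrator assumes (`Logger.default_logger` is the constructor used elsewhere in this diff):

```python
import asyncio
import fsspec

from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperAsync
from sibi_dst.utils import Logger

class ToyArtifact:
    """Illustrative artifact; not part of sibi-dst."""
    def __init__(self, **kwargs): ...
    def __enter__(self): return self
    def __exit__(self, *exc): return False
    def update_parquet(self, period, **kwargs):
        print(f"updating for period={period}")

async def main() -> None:
    wrapper = ArtifactUpdaterMultiWrapperAsync(
        wrapped_classes={"daily": [ToyArtifact]},
        logger=Logger.default_logger(logger_name="updates"),
        fs=fsspec.filesystem("file"),
        use_dask=False,  # exercise the local asyncio path
    )
    results = await wrapper.update_data("daily")
    print(wrapper.get_update_status())

asyncio.run(main())
```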
sibi_dst/osmnx_helper/__init__.py CHANGED
@@ -1,7 +1,9 @@
  from .base_osm_map import BaseOsmMap
  from .utils import PBFHandler
+ from .route_path_builder import RoutePathBuilder, RoutePathBuilderConfig
  __all__ = [
      "BaseOsmMap",
-     #"MapConfig",
+     "RoutePathBuilder",
+     "RoutePathBuilderConfig",
      "PBFHandler",
  ]
sibi_dst/utils/__init__.py CHANGED
@@ -24,6 +24,7 @@ from .manifest_manager import MissingManifestManager
  __all__ = [
      "Logger",
      "ManagedResource",
+
      "ConfigManager",
      "ConfigLoader",
      "DateUtils",
@@ -42,5 +43,5 @@ __all__ = [
      "FsRegistry",
      "DataFromHttpSource",
      "WebDAVClient",
-     "MissingManifestManager"
+     "MissingManifestManager",
  ]
sibi_dst/utils/boilerplate/base_pipeline.py CHANGED
@@ -93,7 +93,7 @@ class BasePipeline(ManagedResource):
          df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
          df["partition_date"] = df[self.date_field].dt.date.astype(str)
 
-         out_path = self.storage_path.rstrip("/")+"/"+self._get_output_filename(fmt="parquet")
+         out_path = self.storage_path.rstrip("/")
          self.logger.info("Saving dataset to %s", out_path)
          ps = ParquetSaver(
              df_result=df,
@@ -111,7 +111,6 @@
              parquet_start_date=self.start_date,
              parquet_end_date=self.end_date,
              parquet_storage_path=self.storage_path,
-             parquet_filename=self._get_output_filename(),
              fs=self.fs,
              debug=self.debug,
              logger=self.logger,
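Dropping `parquet_filename` and pointing `out_path` at the dataset root pairs with the `partition_date` column added just above: the saver can now manage a partitioned layout under that root instead of one file per run. A sketch of the equivalent layout with plain `dask.dataframe` (ParquetSaver's internals are not part of this diff, so this only illustrates the on-disk shape):

```python
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({
    "order_date": pd.to_datetime(["2025-09-10", "2025-09-11"]),
    "amount": [10.0, 12.5],
})
pdf["partition_date"] = pdf["order_date"].dt.date.astype(str)

ddf = dd.from_pandas(pdf, npartitions=1)

# Writing to the dataset root yields hive-style directories:
#   out/partition_date=2025-09-10/part.0.parquet, ...
ddf.to_parquet("out", partition_on=["partition_date"], write_index=False)
```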
sibi_dst/utils/dask_utils.py CHANGED
@@ -1,7 +1,10 @@
+ from __future__ import annotations
+
  import asyncio
  from typing import List, Any, Dict
 
  import dask
+ #dask.config.set({"distributed.worker.daemon": False})
  import dask.dataframe as dd
 
  def _to_int_safe(x) -> int:
@@ -58,4 +61,124 @@ class UniqueValuesExtractor:
              return col, await self.compute_to_list(ser)
 
          pairs = await asyncio.gather(*(one(c) for c in columns))
-         return dict(pairs)
+         return dict(pairs)
+
+ from contextlib import suppress
+ from dask.distributed import Client, LocalCluster, get_client
+ import os
+
+ class DaskClientMixin:
+     """
+     Provides shared Dask client lifecycle management.
+     Ensures reuse of an existing client if available,
+     or creates a local in-process Dask cluster for fallback.
+     """
+
+     def _init_dask_client(
+         self,
+         dask_client=None,
+         logger=None,
+         *,
+         n_workers: int = 1,
+         threads_per_worker: int = 1,
+         processes: bool = False,
+         asynchronous: bool = False,
+         memory_limit: str = "auto",
+         #dashboard_address: str | None = None,
+         local_directory: str | None = None,
+         silence_logs: str = "info",
+         resources: dict | None = None,
+         timeout: int = 30,
+     ):
+         self.dask_client = dask_client
+         self.own_dask_client = False
+         self.logger = logger
+
+         if self.dask_client is None:
+             with suppress(ValueError, RuntimeError):
+                 # Try to attach to an existing client (common in shared Dask setups)
+                 self.dask_client = get_client()
+
+         if self.dask_client is None:
+             # Default to half of logical cores if not specified
+             n_workers = n_workers or max(2, os.cpu_count() // 2)
+
+             cluster = LocalCluster(
+                 n_workers=n_workers,
+                 threads_per_worker=threads_per_worker,
+                 processes=processes,
+                 asynchronous=asynchronous,
+                 memory_limit=memory_limit,
+                 local_directory=local_directory,
+                 silence_logs=silence_logs,
+                 resources=resources,
+                 timeout=timeout,
+             )
+
+             self.dask_client = Client(cluster)
+             self.own_dask_client = True
+
+             if self.logger:
+                 self.logger.info(
+                     f"Started local Dask cluster with {n_workers} workers × {threads_per_worker} threads "
+                     f"({memory_limit} memory per worker). Dashboard: {self.dask_client.dashboard_link}"
+                 )
+         else:
+             if self.logger:
+                 self.logger.debug(
+                     f"Using existing Dask client: {self.dask_client.dashboard_link}"
+                 )
+
+     def _close_dask_client(self):
+         """Close the Dask client if this instance created it."""
+         if getattr(self, "own_dask_client", False) and self.dask_client is not None:
+             try:
+                 cluster = getattr(self.dask_client, "cluster", None)
+                 self.dask_client.close()
+                 if cluster is not None:
+                     cluster.close()
+                 if self.logger:
+                     self.logger.info("Closed local Dask client and cluster.")
+             except Exception as e:
+                 if self.logger:
+                     self.logger.warning(f"Error while closing Dask client: {e}")
+
+ # from contextlib import suppress
+ # from dask.distributed import Client, get_client
+ #
+ # class DaskClientMixin:
+ #     """
+ #     Provides shared Dask client lifecycle management.
+ #     Ensures reuse of existing client when available, otherwise creates a lightweight local one.
+ #     """
+ #
+ #     def _init_dask_client(self, dask_client=None, logger=None):
+ #         self.dask_client = dask_client
+ #         self.own_dask_client = False
+ #         self.logger = logger
+ #
+ #         if self.dask_client is None:
+ #             with suppress(ValueError, RuntimeError):
+ #                 # Try to attach to an existing active client if running inside a Dask context
+ #                 self.dask_client = get_client()
+ #
+ #         if self.dask_client is None:
+ #             # Start a local in-process scheduler for fallback
+ #             self.dask_client = Client(processes=False)
+ #             self.own_dask_client = True
+ #             if self.logger:
+ #                 self.logger.info(f"Started local Dask client: {self.dask_client.dashboard_link}")
+ #         else:
+ #             if self.logger:
+ #                 self.logger.debug(f"Using existing Dask client: {self.dask_client.dashboard_link}")
+ #
+ #     def _close_dask_client(self):
+ #         """Close client only if this instance created it."""
+ #         if getattr(self, "own_dask_client", False) and self.dask_client is not None:
+ #             try:
+ #                 self.dask_client.close()
+ #                 if self.logger:
+ #                     self.logger.info("Closed local Dask client.")
+ #             except Exception as e:
+ #                 if self.logger:
+ #                     self.logger.warning(f"Error while closing Dask client: {e}")
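The new `DaskClientMixin` attaches to an already-running client via `get_client()` when one exists, otherwise it creates a `LocalCluster` and records ownership so `_close_dask_client()` only tears down what it started. A minimal consumer sketch (the stdlib logger stands in for sibi_dst's `Logger`):

```python
import logging

from sibi_dst.utils.dask_utils import DaskClientMixin

class SquareJob(DaskClientMixin):
    """Illustrative consumer; not part of sibi-dst."""
    def __init__(self, dask_client=None):
        self._init_dask_client(dask_client, logger=logging.getLogger("square-job"))

    def run(self, numbers):
        futures = self.dask_client.map(lambda x: x * x, numbers)
        try:
            return self.dask_client.gather(futures)
        finally:
            # no-op unless this instance created the cluster
            self._close_dask_client()

print(SquareJob().run([1, 2, 3]))  # [1, 4, 9]
```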
sibi_dst/utils/data_wrapper.py CHANGED
@@ -87,17 +87,6 @@ class DataWrapper(ManagedResource):
          "dataclass": self.dataclass.__name__
      })
 
-     # --------------------- Context Management ---------------------
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         """Ensure manifest is saved and resources are cleaned up on context exit."""
-         if self.mmanifest:
-             try:
-                 self.mmanifest.save()
-             except Exception as e:
-                 self.logger.error(f"Failed to save manifest in __exit__: {e}", extra=self.logger_extra)
-         # Call parent's __exit__ which triggers _cleanup
-         return super().__exit__(exc_type, exc_val, exc_tb)
-
      # --------------------- Cleanup ---------------------
      def _cleanup(self) -> None:
          """Signal shutdown during class-specific cleanup."""
sibi_dst-2025.9.12.dist-info/METADATA CHANGED
@@ -1,34 +1,31 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: sibi-dst
- Version: 2025.9.11
- Summary: Data Science Toolkit
- Author: Luis Valverde
- Author-email: lvalverdeb@gmail.com
- Requires-Python: >=3.11,<4.0
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Dist: clickhouse-connect (>=0.8.18,<0.9.0)
- Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
- Requires-Dist: dask[complete] (>=2025.9.0,<2026.0.0)
- Requires-Dist: distributed (>=2025.9.1,<2026.0.0)
- Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
- Requires-Dist: opentelemetry-exporter-otlp (>=1.35.0,<2.0.0)
- Requires-Dist: opentelemetry-sdk (>=1.35.0,<2.0.0)
- Requires-Dist: pandas (>=2.3.1,<3.0.0)
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
- Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
- Requires-Dist: pydantic (>=2.11.7,<3.0.0)
- Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
- Requires-Dist: pymysql (>=1.1.1,<2.0.0)
- Requires-Dist: pyrosm (>=0.6.2,<0.7.0)
- Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
- Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
- Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
- Requires-Dist: tqdm (>=4.67.1,<5.0.0)
- Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
+ Version: 2025.9.12
+ Summary: A data science toolkit for scalable data processing and analysis.
+ Requires-Python: >=3.11
  Description-Content-Type: text/markdown
+ Requires-Dist: clickhouse-connect>=0.9.2
+ Requires-Dist: clickhouse-driver>=0.2.9
+ Requires-Dist: dask>=2025.9.1
+ Requires-Dist: distributed>=2025.9.1
+ Requires-Dist: fastapi>=0.118.0
+ Requires-Dist: folium>=0.20.0
+ Requires-Dist: mysqlclient>=2.2.7
+ Requires-Dist: opentelemetry-api>=1.37.0
+ Requires-Dist: opentelemetry-exporter-otlp>=1.37.0
+ Requires-Dist: opentelemetry-sdk>=1.37.0
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: psycopg2>=2.9.10
+ Requires-Dist: pyarrow>=21.0.0
+ Requires-Dist: pydantic>=2.11.10
+ Requires-Dist: pymysql>=1.1.2
+ Requires-Dist: redis>=6.4.0
+ Requires-Dist: s3fs>=2025.9.0
+ Requires-Dist: sqlalchemy>=2.0.43
+ Requires-Dist: tqdm>=4.67.1
+ Requires-Dist: uvicorn>=0.37.0
+ Requires-Dist: webdav4>=0.10.0
+ Requires-Dist: wheel>=0.45.1
 
  ### SIBI-DST
 
@@ -60,4 +57,3 @@ pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies
 
 
  ```
- 
sibi_dst-2025.9.12.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
- sibi_dst/__init__.py,sha256=D4TMsAMGRl54J5PNYMDf_z0NcCnS_lZK1YHze2eJvpc,464
- sibi_dst/df_helper/__init__.py,sha256=aOQFmWpLeSq-7O8IYEKaxR1nvuIe9F7SJsOsgwj1ECg,579
- sibi_dst/df_helper/_artifact_updater_async.py,sha256=0VBwGMIQXKagr4MpshvfuhplwePD2VQbN2MOUTgQyJs,9992
+ sibi_dst/__init__.py,sha256=QQVT3Xlj8iZN17sSMfRQFSb_DHr8A7giJP8hn02K2Oo,585
+ sibi_dst/df_helper/__init__.py,sha256=7rUdMybgCNZhQL_J7IFTTHz_xtFin81xavi5-PUExkA,463
+ sibi_dst/df_helper/_artifact_updater_async.py,sha256=AZp0vM3vji0tjiaScr8a9SUMH15NjPIKYPdRQ7SJe3Y,11372
  sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
  sibi_dst/df_helper/_df_helper.py,sha256=rgVP4ggiCW6tTHmUz2UqUvLznwOtY5IyoVS3WSlg73U,17005
  sibi_dst/df_helper/_parquet_artifact.py,sha256=UXkhDSAVRNKp9DykVhJd3agnryCZT0Sj2qhdhUZomuM,19421
@@ -22,39 +22,30 @@ sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi
  sibi_dst/df_helper/core/_filter_handler.py,sha256=9C30zrT8wSGy1X8ryiTWc0XfnbpeoHndHgoOcHKOPOo,19309
  sibi_dst/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxRrQKE5FQRxcEWsac,6736
  sibi_dst/df_helper/core/_query_config.py,sha256=1ApqmuSGXTC3CdF-xMsSbCa3V2Z5hOP3Wq5huhzZwqY,439
- sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
  sibi_dst/geopy_helper/__init__.py,sha256=Q1RJiUZIOlV0QNNLjxZ_2IZS5LqIe5jRbeQkfD1Vm60,112
  sibi_dst/geopy_helper/geo_location_service.py,sha256=1ArI980QF_gRw096ZsABHwJt-m55jrfOlB8tPwL1BvY,2959
  sibi_dst/geopy_helper/utils.py,sha256=Sb7qfSqIyWh-AZ4GBdB9-z5FrQPWtrdtQLLcNjph0yw,3351
- sibi_dst/osmnx_helper/__init__.py,sha256=On2_pD13HmzZjP-YrXV9BA9uFK-z26QkQE-MliGdv5w,134
+ sibi_dst/osmnx_helper/__init__.py,sha256=hTv1uCnN35MSUEYNWk30OHHSflDb1qidwfLDXhUU8cE,243
  sibi_dst/osmnx_helper/base_osm_map.py,sha256=L7g3VBiayHX41BcCBTOCS0iJOKzp2ZZYcrp8N-mnU90,19392
+ sibi_dst/osmnx_helper/route_path_builder.py,sha256=XJJyu4YXegAkCRjE-knyQncwXaxDVXZhalYacLcb7e0,3557
+ sibi_dst/osmnx_helper/utils.py,sha256=7-lFVhGn4rHjGz6FvpXtC2jY8UzGIVyKR3MVyEfB7nw,14407
  sibi_dst/osmnx_helper/basemaps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sibi_dst/osmnx_helper/basemaps/calendar_html.py,sha256=UArt6FDgoCgoRte45Xo3IHqd-RNzW0YgitgZYfOFasY,4031
  sibi_dst/osmnx_helper/basemaps/route_map_plotter.py,sha256=rsJidieojcqIoe0kBanZbrxcelrS6nWoAyWoQXWdPiQ,11849
  sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqTadrxMx-YK4djYhqPqfQ,10941
- sibi_dst/osmnx_helper/route_path_builder.py,sha256=XJJyu4YXegAkCRjE-knyQncwXaxDVXZhalYacLcb7e0,3557
- sibi_dst/osmnx_helper/utils.py,sha256=7-lFVhGn4rHjGz6FvpXtC2jY8UzGIVyKR3MVyEfB7nw,14407
  sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sibi_dst/tests/test_baseclass.py,sha256=5huAwjWo_SOEZR2_0y5w9qUmw5G7pVdm8X1OTG87JK0,11562
  sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
- sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
+ sibi_dst/utils/__init__.py,sha256=eoW7iROrCUVYjNT1owMgGvW6U7lolbNy3FrBb_wInPs,1304
  sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
  sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
- sibi_dst/utils/boilerplate/__init__.py,sha256=89oepkDCnjegnhzd6Kgga71AH17KSvNMlgD83CAAmg8,582
- sibi_dst/utils/boilerplate/base_attacher.py,sha256=iZftWNUx8y370OJP_kGCs5v3t2RgPuARIK_jQeFfbAU,2089
- sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
- sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
- sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
- sibi_dst/utils/boilerplate/base_pipeline.py,sha256=LQZBACksqHO3tQ8OhWShfqjiGyda7UhrmllRq3eWQfU,5690
- sibi_dst/utils/boilerplate/base_pipeline_template.py,sha256=D5HFA4odsR2wlTY6iLg1tm57Tsh91QkoYjjX8eUgrjU,1574
- sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
  sibi_dst/utils/business_days.py,sha256=DPZExTXTt7n3IbAaEuVacm-vZgbR_Ug2bJTPBUaoP3g,6694
  sibi_dst/utils/clickhouse_writer.py,sha256=8W_dTEOKQp4pXANznVSxRqFA2H5oD8UJifiBAONpXWY,17001
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
- sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
+ sibi_dst/utils/dask_utils.py,sha256=UA8Bp0Qm7n8WSyjFmxNSrWdJ5TG_v6NCrmgLxPbnXlA,6692
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
  sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
- sibi_dst/utils/data_wrapper.py,sha256=-a5LkkjBVXUPwDYngn5K98oSNYsQf87zNaih9IJxs-I,18543
+ sibi_dst/utils/data_wrapper.py,sha256=9HTuDXgvfhmFAOyNG_GEOaHuojxE3639yyzOoBt7Unc,18000
  sibi_dst/utils/date_utils.py,sha256=hBVWu9_cqiZ-XsLR7QY9Iek09DQKLwrY1ZlYxWlXj7g,31101
  sibi_dst/utils/df_utils.py,sha256=bQGromLOEdRTvbVVcuHq0vQ0fIgqhwOoD_eIp5v7VEY,10899
  sibi_dst/utils/file_age_checker.py,sha256=44B3lwH_PLwzMfiKkgvJKjKx-qSgITIXxKfNbdf_VeA,11552
@@ -66,15 +57,23 @@ sibi_dst/utils/manifest_manager.py,sha256=9y4cV-Ig8O-ekhApp_UObTY-cTsl-bGnvKIThI
  sibi_dst/utils/parquet_saver.py,sha256=Itctsf8UnBCnD6NrP00FK0y9KzZgKYfjUk1CC-0x-F0,20486
  sibi_dst/utils/periods.py,sha256=8eTGi-bToa6_a8Vwyg4fkBPryyzft9Nzy-3ToxjqC8c,1434
  sibi_dst/utils/phone_formatter.py,sha256=oeM22nLjhObENrpItCNeVpkYS4pXRm5hSxdk0M4nvwU,4580
- sibi_dst/utils/progress/__init__.py,sha256=VELVxzo2cePN_-LL0veel8-F3po6tokY5MOOpu6pz1A,92
- sibi_dst/utils/progress/jobs.py,sha256=nE58ng9GPCPZhnaCDltr1tQgu3AJVqBJ1dWbGcCH4xo,3089
- sibi_dst/utils/progress/sse_runner.py,sha256=NttASZH_ayXo1Zi6I4tSwYnWySLxexOYQGlqzOZiXlI,4965
  sibi_dst/utils/storage_config.py,sha256=DLtP5jKVM0mdFdgRw6LQfRqyavMjJcCVU7GhsUCRH78,4427
  sibi_dst/utils/storage_hive.py,sha256=eZ3nq2YWLUUG-06iJubSC15cwSHEbKKdKIwoVhD_I_E,8568
  sibi_dst/utils/storage_manager.py,sha256=La1NY79bhRAmHWXp7QcXJZtbHoRboJMgoXOSXbIl1SA,6643
  sibi_dst/utils/update_planner.py,sha256=1UOh4MjZSfaA_ZO-nKailOGal5EY-xVR8KSCJzo7p_g,16834
  sibi_dst/utils/webdav_client.py,sha256=D9J5d1f1qQwHGm5FE5AMVpOPwcU5oD7K8JZoKGP8NpM,5811
  sibi_dst/utils/write_gatekeeper.py,sha256=V8sY9YMO-JuN8Ps7prqwVSjP4f1HGH9KiVV-aTPCC_k,569
+ sibi_dst/utils/boilerplate/__init__.py,sha256=89oepkDCnjegnhzd6Kgga71AH17KSvNMlgD83CAAmg8,582
+ sibi_dst/utils/boilerplate/base_attacher.py,sha256=iZftWNUx8y370OJP_kGCs5v3t2RgPuARIK_jQeFfbAU,2089
+ sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
+ sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
+ sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
+ sibi_dst/utils/boilerplate/base_pipeline.py,sha256=cPVvjGxd0tDDvP_c27MNDHNuNnM6k3yx5bivN5ZFgrQ,5587
+ sibi_dst/utils/boilerplate/base_pipeline_template.py,sha256=D5HFA4odsR2wlTY6iLg1tm57Tsh91QkoYjjX8eUgrjU,1574
+ sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
+ sibi_dst/utils/progress/__init__.py,sha256=VELVxzo2cePN_-LL0veel8-F3po6tokY5MOOpu6pz1A,92
+ sibi_dst/utils/progress/jobs.py,sha256=nE58ng9GPCPZhnaCDltr1tQgu3AJVqBJ1dWbGcCH4xo,3089
+ sibi_dst/utils/progress/sse_runner.py,sha256=NttASZH_ayXo1Zi6I4tSwYnWySLxexOYQGlqzOZiXlI,4965
  sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
  sibi_dst/v2/df_helper/_df_helper.py,sha256=9pED3bjQ2Z81zqzJrZ9e7SguoO4-hBmNTJK4WOKrr4M,9297
@@ -95,6 +94,7 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
- sibi_dst-2025.9.11.dist-info/METADATA,sha256=LPlDJ-9iwPB3hKALqJuAX3vvZX2EHv83BPD4hwVs6Hw,2761
- sibi_dst-2025.9.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- sibi_dst-2025.9.11.dist-info/RECORD,,
+ sibi_dst-2025.9.12.dist-info/METADATA,sha256=6WESOFkRannoMbUj_V-dDpti2qQ5XIxUgHSTfaqKAHA,2413
+ sibi_dst-2025.9.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ sibi_dst-2025.9.12.dist-info/top_level.txt,sha256=g3Cj4R-rciuNyJgcxuxNgw5nhN0n4TCB0ujcTEjZNiU,9
+ sibi_dst-2025.9.12.dist-info/RECORD,,
sibi_dst-2025.9.12.dist-info/WHEEL CHANGED
@@ -1,4 +1,5 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.1
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
+
sibi_dst-2025.9.12.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ sibi_dst
sibi_dst/df_helper/data_cleaner.py DELETED
@@ -1,132 +0,0 @@
- import re
- from nltk.corpus import stopwords
- from nltk.stem import SnowballStemmer
- import dask.dataframe as dd
- from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
- import nltk
-
- class DataCleaner:
-     def __init__(self, dataframe):
-         self.original_df = dataframe
-         self.df = dataframe.copy()
-         self.duplicates_df = None
-
-     def handle_missing_values(self, strategy='mean'):
-         if strategy == 'mean':
-             self.df = self.df.fillna(self.df.mean())
-         elif strategy == 'median':
-             self.df = self.df.fillna(self.df.median())
-         elif strategy == 'mode':
-             self.df = self.df.fillna(self.df.mode().iloc[0])
-         elif strategy == 'drop':
-             self.df = self.df.dropna()
-         return self
-
-     def identify_duplicates(self, subset=None):
-         self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
-         return self.duplicates_df
-
-     def remove_duplicates(self):
-         if self.duplicates_df is not None:
-             self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
-         return self
-
-     def validate_date_fields(self, date_columns=None):
-         if date_columns is None:
-             date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
-         for col in date_columns:
-             print('Validating date field: ', col)
-             self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
-         return self
-
-     def clean_text(self, text_columns=None, language='english'):
-         nltk.download('stopwords')
-         stop_words = set(stopwords.words(language))
-         stemmer = SnowballStemmer(language)
-
-         def clean_text(text):
-             if isinstance(text, str):
-                 text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
-                 text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
-                 words = text.split()
-                 words = [word for word in words if word not in stop_words]  # Remove stop words
-                 words = [stemmer.stem(word) for word in words]  # Apply stemming
-                 return ' '.join(words)
-             return text
-
-         if text_columns is None:
-             text_columns = self.df.select_dtypes(include=['object', 'string']).columns
-             text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']
-
-         for col in text_columns:
-             print('Cleaning text field: ', col)
-             self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
-         return self
-
-     def validate_numeric_fields(self, int_columns=None, float_columns=None):
-         if int_columns is None:
-             int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
-         if float_columns is None:
-             float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns
-
-         for col in int_columns:
-             print('Validating integer field: ', col)
-             self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')
-
-         for col in float_columns:
-             print('Validating float field: ', col)
-             self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')
-
-         return self
-
-     def detect_categorical_columns(self, threshold=0.05):
-         """
-         Detect columns that can be converted to 'category' dtype.
-
-         Parameters:
-         threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.
-
-         Returns:
-         List of column names that can be converted to 'category' dtype.
-         """
-         categorical_columns = []
-
-         def unique_ratio(partition, col):
-             return partition[col].nunique() / len(partition)
-
-         for col in self.df.columns:
-             print("Detecting categorical columns: ", col)
-             unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
-             overall_unique_ratio = unique_ratios.sum() / len(self.df)
-             if overall_unique_ratio < threshold:
-                 print(f'Column {col} is categorical')
-                 categorical_columns.append(col)
-
-         return categorical_columns
-
-     def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
-         if columns is None:
-             columns = self.detect_categorical_columns(threshold)
-
-         if method == 'onehot':
-             for col in columns:
-                 self.df[col] = self.df[col].astype('category')
-             encoder = OneHotEncoder(sparse_output=False)
-             self.df = encoder.fit_transform(self.df)
-         elif method == 'label':
-             encoder = LabelEncoder()
-             for col in columns:
-                 self.df[col] = encoder.fit_transform(self.df[col])
-         return self
-
-     def analyze_dtypes(self):
-         return self.df.dtypes
-
-     def get_cleaned_dataframe(self):
-         return self.df
-
-     def get_original_dataframe(self):
-         return self.original_df
-
-     def get_duplicates_dataframe(self):
-         return self.duplicates_df