sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  13. sibi_dst/utils/base.py +302 -96
  14. sibi_dst/utils/clickhouse_writer.py +472 -206
  15. sibi_dst/utils/data_utils.py +139 -186
  16. sibi_dst/utils/data_wrapper.py +317 -73
  17. sibi_dst/utils/date_utils.py +1 -0
  18. sibi_dst/utils/df_utils.py +193 -213
  19. sibi_dst/utils/file_utils.py +3 -2
  20. sibi_dst/utils/filepath_generator.py +314 -152
  21. sibi_dst/utils/log_utils.py +581 -242
  22. sibi_dst/utils/manifest_manager.py +60 -76
  23. sibi_dst/utils/parquet_saver.py +33 -27
  24. sibi_dst/utils/phone_formatter.py +88 -95
  25. sibi_dst/utils/update_planner.py +180 -178
  26. sibi_dst/utils/webdav_client.py +116 -166
  27. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  28. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
  29. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/update_planner.py
@@ -1,61 +1,21 @@
-import datetime
+import datetime as dt
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
+
 import pandas as pd
-from .date_utils import FileAgeChecker
-from pydantic import BaseModel, Field
-from rich.console import Console
-from rich.table import Table
 
 from sibi_dst.utils import ManagedResource
+from .date_utils import FileAgeChecker
 
 
-class UpdateConfig(BaseModel):
-    """
-    A unified Pydantic model for the data update process configuration.
-    Acts as a single source of truth for all settings.
-    """
-    overwrite: bool = False
-    reverse_order: bool = True
-    ignore_missing: bool = False
-    history_days_threshold: int = 30
-    max_age_minutes: int = 1440  # 24 hours
-    show_progress: bool = False
-    verbose: bool = False
-    debug: bool = False
-    start_date: datetime.date
-    end_date: datetime.date
-    custom_priority_map: Optional[Dict[str, int]] = None
-    max_threads: int = 3
-    timeout: float = 30.0
-
-    class Config:
-        arbitrary_types_allowed = True
-
 class UpdatePlanner(ManagedResource):
     """
-    A utility class to scan a date-partitioned filesystem and
-    generate an update plan indicating which dates need processing.
-
-    Attributes:
-        data_path: Base path (always ends with '/').
-        filename: Filename inside each date folder.
-        fs: fsspec filesystem instance.
-        age_checker: FileAgeChecker for computing file ages.
-        reference_date: The "today" date used for history windows (date or ISO string).
-        history_days_threshold: Number of days considered "in history".
-        max_age_minutes: File staleness threshold in minutes.
-        overwrite: If True, forces updates for all dates.
-        ignore_missing: If True, skips missing files outside history.
-        reverse_order: If True, sorts dates descending in output.
-        show_progress: If True, displays a tqdm progress bar.
-        logger: Logger for informational messages.
-
-    Note:
-        generate_plan() will overwrite self.plan and self.df_req, and returns a DataFrame of required updates.
+    Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
+    Produces a Pandas DataFrame plan; it does *not* load data frames, so Dask-vs-Pandas
+    concerns do not apply here.
     """
 
-    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]]={
+    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "file_is_recent": 0,
         "missing_ignored": 0,
         "overwrite_forced": 1,
@@ -68,183 +28,221 @@ class UpdatePlanner(ManagedResource):
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
     def __init__(
-        self,
-        data_path: str,
-        filename: str,
-        description: str = "Update Planner",
-        reference_date: Union[str, datetime.date] = None,
-        history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
-        max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
-        overwrite: bool = False,
-        ignore_missing: bool = False,
-        custom_priority_map: Optional[Dict[str, int]] = None,
-        reverse_order: bool = False,
-        show_progress: bool = False,
-        skipped: Optional[List[str]] = None,
-        **kwargs
+            self,
+            parquet_storage_path: str,
+            parquet_filename: str,
+            description: str = "Update Planner",
+            reference_date: Union[str, dt.date, None] = None,
+            history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+            max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+            overwrite: bool = False,
+            ignore_missing: bool = False,
+            custom_priority_map: Optional[Dict[str, int]] = None,
+            reverse_order: bool = False,
+            show_progress: bool = False,
+            skipped: Optional[List[str]] = None,
+            **kwargs,
     ):
-        # Initialize state
         super().__init__(**kwargs)
-        self.plan: pd.DataFrame = pd.DataFrame()
-        self.df_req: pd.DataFrame = pd.DataFrame()
+
+        # Public-ish attributes
         self.description = description
-        self.data_path = self._ensure_trailing_slash(data_path)
-        self.filename = filename
+        self.data_path = self._ensure_trailing_slash(parquet_storage_path)
+        self.filename = parquet_filename
         self.reverse_order = reverse_order
         self.show_progress = show_progress
-        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+        self.overwrite = overwrite
+        self.ignore_missing = ignore_missing
+        self.history_days_threshold = history_days_threshold
+        self.max_age_minutes = max_age_minutes
+        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
+        self.skipped = set(skipped or [])
 
-        # Normalize reference date
+        # Execution knobs from kwargs (fed by upstream config)
+        self.max_threads: int = int(kwargs.get("max_threads", 3))
+        self.timeout: float = float(kwargs.get("timeout", 30.0))
+
+        # Date window
+        self.start_date = kwargs.get("parquet_start_date")
+        self.end_date = kwargs.get("parquet_end_date")
+
+        # Reference "today"
         if reference_date is None:
-            self.reference_date = datetime.date.today()
+            self.reference_date = dt.date.today()
         else:
             self.reference_date = pd.to_datetime(reference_date).date()
 
-        # Thresholds and flags
-        self.history_days_threshold = history_days_threshold
-        self.max_age_minutes = max_age_minutes
-        self.overwrite = overwrite
-        self.ignore_missing = ignore_missing
-        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
-        self.skipped = skipped or []
-
-    @staticmethod
-    def _ensure_trailing_slash(path: str) -> str:
-        """Ensure that the provided path ends with a single '/'."""
-        return path.rstrip('/') + '/'
-
-    def _generate_plan(
-        self,
-        start: datetime.date,
-        end: datetime.date,
-        freq: str = "D"
-    ) -> None:
-        """
-        Internal: populates self.plan with all dates, and self.df_req with only those needing update.
-        """
-        dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
-        history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
-        rows: List[Dict] = []
+        # Helpers & state
+        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+        self.plan: pd.DataFrame = pd.DataFrame()
+        self.df_req: pd.DataFrame = pd.DataFrame()
 
-        # Parallel file status checks
-        with ThreadPoolExecutor() as executor:
-            futures = {executor.submit(self._get_file_status, d): d for d in dates}
-            iterator = as_completed(futures)
-            if self.show_progress:
-                from tqdm import tqdm
-                iterator = tqdm(
-                    iterator,
-                    total=len(futures),
-                    desc=f"Scanning dates for {self.description}",
-                    unit="date",
-                    leave=False
-                )
-            for future in iterator:
-                d = futures[future]
-                try:
-                    exists, age = future.result()
-                    rows.append(self._make_row(d, history_start, exists, age))
-                except Exception as exc:
-                    self.logger.error(f"Error processing date {d}: {exc}")
-                    rows.append(self._make_row(d, history_start, False, None))
+        # internal run flag to print once per run if caller reuses instance
+        self._printed_this_run: bool = False
 
-        df = pd.DataFrame(rows)
-        df = df.sort_values(
-            by=["update_priority", "date"],
-            ascending=[True, not self.reverse_order]
-        ).reset_index(drop=True)
+    # --------------------- public helpers ---------------------
+    def has_plan(self) -> bool:
+        """Safe truthiness for plan existence."""
+        return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
 
-        self.plan = df
-        self.df_req = df[df.update_required].copy()
+    def required_count(self) -> int:
+        return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)
 
+    # --------------------- core API ---------------------
     def generate_plan(
-        self,
-        start: Union[str, datetime.date],
-        end: Union[str, datetime.date]
+            self,
+            start: Union[str, dt.date, None] = None,
+            end: Union[str, dt.date, None] = None,
+            freq: str = "D",
     ) -> pd.DataFrame:
         """
-        Generate and return a DataFrame of dates requiring updates between start and end,
-        sorted by update_priority and date (descending if reverse_order=True).
+        Build a plan for [start, end]. Returns rows that require update (df_req).
         """
+        start = start or self.start_date
+        end = end or self.end_date
         sd = pd.to_datetime(start).date()
         ed = pd.to_datetime(end).date()
         if sd > ed:
             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
 
         self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
-        self._generate_plan(sd, ed)
+        self._generate_plan(sd, ed, freq=freq)
         self.logger.info(
             f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
            f"{len(self.df_req)} require update"
         )
-
         return self.df_req
 
     def show_update_plan(self) -> None:
-        """
-        Display the full update plan as a styled DataFrame.
-        """
-        if self.plan.empty:
-            self.logger.warning("No update plan available. Call generate_plan() first.")
+        """Pretty-print the current plan once per run."""
+        if not self.has_plan():
+            self.logger.info("No update plan to show.")
+            return
+        if self._printed_this_run:
             return
 
-        console = Console(record=True)
+        try:
+            from rich.console import Console
+            from rich.table import Table
+        except Exception:
+            # Fallback: plain text
+            self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}")
+            self._printed_this_run = True
+            return
 
-        table = Table(title=f"Update Plan for {self.data_path}", show_header=True, header_style="bold magenta")
+        table = Table(
+            title=f"Update Plan for {self.data_path}",
+            show_header=True,
+            header_style="bold magenta",
+        )
         for column in self.plan.columns:
             table.add_column(column, justify="left")
+
         for _, row in self.plan.iterrows():
-            table.add_row(*(str(item) for item in row))
+            table.add_row(*(str(row[col]) for col in self.plan.columns))
 
         console = Console()
         with console.capture() as capture:
             console.print(table)
-        plan_string = capture.get()
-
-        self.logger.info(f"Full Update Plan:\n{plan_string.strip()}")
+        self.logger.info(f"Full Update Plan:\n{capture.get().strip()}")
+        self._printed_this_run = True
 
-    def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[datetime.date]]]:
-        """Yields batches of dates to be processed, grouped and sorted by priority."""
-        if self.plan.empty:
+    def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
+        """
+        Yield (priority, [dates...]) batches, smallest priority first.
+        """
+        if not self.has_plan():
             return
-
-        required_updates = self.plan[self.plan['update_required']].copy()
-        if required_updates.empty:
+        req = self.plan[self.plan["update_required"]]
+        if req.empty:
             return
-
-        for priority in sorted(required_updates["update_priority"].unique()):
-            dates_df = required_updates[required_updates["update_priority"] == priority]
-            # Sort dates within the priority group
-            sorted_dates = dates_df.sort_values(by=["date"], ascending=not self.reverse_order)
-            dates = sorted_dates["date"].tolist()
+        for priority in sorted(req["update_priority"].unique()):
+            dates_df = req[req["update_priority"] == priority]
+            # sort within group
+            dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
+            dates = dates_df["date"].tolist()
             if dates:
-                yield priority, dates
+                yield int(priority), dates
+
+    # --------------------- internals ---------------------
+    @staticmethod
+    def _ensure_trailing_slash(path: str) -> str:
+        return path.rstrip("/") + "/"
 
-    def _get_file_status(
-        self,
-        date: datetime.date
-    ) -> Tuple[bool, Optional[float]]:
+    def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
+        """
+        Populate self.plan with all dates and self.df_req with the subset to update.
+        """
+        dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
+        history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
+        rows: List[Dict] = []
+
+        # bound threads
+        max_workers = max(1, int(self.max_threads))
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {executor.submit(self._get_file_status, d): d for d in dates}
+            iterator = as_completed(futures)
+            if self.show_progress:
+                try:
+                    from tqdm import tqdm
+                    iterator = tqdm(
+                        iterator, total=len(futures),
+                        desc=f"Scanning dates for {self.description}",
+                        unit="date", leave=False
+                    )
+                except Exception:
+                    pass  # no tqdm → proceed without progress bar
+
+            for future in iterator:
+                d = futures[future]
+                try:
+                    exists, age = future.result(timeout=self.timeout)
+                    rows.append(self._make_row(d, history_start, exists, age))
+                except Exception as exc:
+                    self.logger.error(f"Error processing date {d}: {exc}")
+                    rows.append(self._make_row(d, history_start, False, None))
+
+        df = pd.DataFrame(rows)
+        # consistent types
+        if not df.empty:
+            df["date"] = pd.to_datetime(df["date"]).dt.date
+            df["update_priority"] = df["update_priority"].astype(int)
+
+        df = df.sort_values(
+            by=["update_priority", "date"],
+            ascending=[True, not self.reverse_order],
+            kind="mergesort",  # stable
+        ).reset_index(drop=True)
+
+        self.plan = df
+        self.df_req = df[df["update_required"]].copy()
+        self._printed_this_run = False
+
+    def _get_file_status(self, date: dt.date) -> Tuple[bool, Optional[float]]:
         """
         Check file existence and age for the given date.
         """
         just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
         if just_path in self.skipped:
-            self.logger.debug(f"Update plan is skipping date {date} as it is in the skipped list.")
+            self.logger.debug(f"Skipping {date}: path in skipped list.")
             return False, None
+
         path = f"{just_path}{self.filename}"
         try:
             exists = self.fs.exists(path)
             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
-            return exists, age
-        except Exception:
+            return bool(exists), age
+        except Exception as e:
+            self.logger.warning(f"exists/age check failed for {path}: {e}")
             return False, None
 
     def _make_row(
-        self,
-        date: datetime.date,
-        history_start: datetime.date,
-        file_exists: bool,
-        file_age: Optional[float]
+            self,
+            date: dt.date,
+            history_start: dt.date,
+            file_exists: bool,
+            file_age: Optional[float],
     ) -> Dict:
         """
         Build a single plan row based on flags and thresholds.
@@ -252,11 +250,11 @@ class UpdatePlanner(ManagedResource):
         within_history = history_start <= date <= self.reference_date
         update_required = False
 
-        # 1. Overwrite mode forces update
+        # 1) Overwrite forces update
         if self.overwrite:
             category = "overwrite_forced"
             update_required = True
-        # 2. Within history window: missing or stale
+        # 2) Inside history window
         elif within_history:
             if not file_exists:
                 category = "missing_in_history"
@@ -266,33 +264,37 @@ class UpdatePlanner(ManagedResource):
                 update_required = True
             else:
                 category = "file_is_recent"
-        # 3. Outside history, missing file
+        # 3) Outside history, missing file (and not ignoring)
         elif not file_exists and not self.ignore_missing:
             category = "create_missing"
             update_required = True
-        # 4. Everything else (existing files outside history, or ignored missing)
+        # 4) Everything else
         else:
             category = "missing_ignored" if not file_exists else "file_is_recent"
 
         return {
             "date": date,
-            "file_exists": file_exists,
+            "file_exists": bool(file_exists),
             "file_age_minutes": file_age,
             "update_category": category,
             "update_priority": self.priority_map.get(category, 99),
-            "update_required": update_required,
+            "update_required": bool(update_required),
             "description": self.description,
         }
 
-    def exclude_dates(self, dates: Set[datetime.date]) -> None:
+    def exclude_dates(self, dates: Set[dt.date]) -> None:
         """
         Exclude specific dates from the update plan.
         """
         if not isinstance(dates, set):
-            raise ValueError("dates must be a set of datetime.date objects.")
-        if self.plan.empty:
-            self.logger.warning("No update plan available. Call generate_plan() first.")
+            raise ValueError("dates must be a set[date].")
+        if not self.has_plan():
+            self.logger.info("No update plan to modify. Call generate_plan() first.")
             return
-        self.plan = self.plan[~self.plan['date'].isin(dates)]
-        self.df_req = self.plan[self.plan["update_required"]]
-        self.logger.info(f"Excluded {len(dates)} dates from the update plan.")
+
+        before = len(self.plan)
+        self.plan = self.plan[~self.plan["date"].isin(dates)]
+        self.df_req = self.plan[self.plan["update_required"]].copy()
+        self.logger.info(
+            f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows)."
+        )
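
Read end to end, the new workflow is: generate the plan (the date window defaults to the `parquet_*` kwargs), optionally print it, then drain priority batches. A short sketch continuing the assumptions of the earlier example (`planner` is the instance built there; `process` is a hypothetical downstream consumer, not part of the package):

    import datetime as dt

    df_req = planner.generate_plan()   # returns only rows with update_required == True
    planner.show_update_plan()         # rich table if rich is importable, else plain text; once per run

    # Batches arrive smallest priority first, e.g. "overwrite_forced" (priority 1);
    # the priority-0 categories in DEFAULT_PRIORITY_MAP never set update_required.
    for priority, dates in planner.get_tasks_by_priority():
        process(priority, dates)       # hypothetical consumer of each date batch

    # Individual dates can be dropped from the plan between runs:
    planner.exclude_dates({dt.date(2025, 7, 15)})
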