sibi-dst 0.3.58__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,9 @@
1
1
  import datetime
2
2
  from concurrent.futures import ThreadPoolExecutor, as_completed
3
- from typing import List, Optional, Dict, Union, Tuple
4
- import logging
5
- from sibi_dst.utils import Logger
6
-
3
+ from typing import List, Optional, Dict, Union, Tuple, Set
7
4
  import pandas as pd
8
5
  import fsspec
9
- from IPython.display import display
10
- from tqdm import tqdm
11
-
6
+ from sibi_dst.utils import Logger
12
7
  from .date_utils import FileAgeChecker
13
8
 
14
9
 
@@ -28,41 +23,45 @@ class UpdatePlanner:
28
23
  overwrite: If True, forces updates for all dates.
29
24
  ignore_missing: If True, skips missing files outside history.
30
25
  reverse_order: If True, sorts dates descending in output.
31
- priority_map: Maps category names to numeric priorities.
32
26
  show_progress: If True, displays a tqdm progress bar.
33
27
  logger: Logger for informational messages.
34
28
 
35
29
  Note:
36
- generate_plan() will overwrite self.plan and self.df_req.
30
+ generate_plan() will overwrite self.plan and self.df_req, and returns a DataFrame of required updates.
37
31
  """
38
32
 
39
33
  DEFAULT_PRIORITY_MAP = {
40
- "overwrite forced": 1,
41
- "missing_in_history": 2,
42
- "existing_but_stale": 3,
43
- "missing_outside_history": 4,
44
- "missing_ignored": 0,
45
- "file_is_recent": 0
34
+ "file_is_recent": 0,
35
+ "missing_ignored": 0,
36
+ "overwrite_forced": 1,
37
+ "create_missing": 2,
38
+ "missing_in_history": 3,
39
+ "stale_in_history": 4,
46
40
  }
47
41
 
42
+ DEFAULT_MAX_AGE_MINUTES = 1440
43
+ DEFAULT_HISTORY_DAYS_THRESHOLD = 30
44
+
48
45
  def __init__(
49
- self,
50
- data_path: str,
51
- filename: str,
52
- description: str = "Update Planner",
53
- fs: Optional[fsspec.AbstractFileSystem] = None,
54
- filesystem_type: str = "file",
55
- filesystem_options: Optional[Dict] = None,
56
- reference_date: Union[str, datetime.date] = None,
57
- history_days_threshold: int = 30,
58
- max_age_minutes: int = 1440,
59
- overwrite: bool = False,
60
- ignore_missing: bool = False,
61
- custom_priority_map: Optional[Dict[str, int]] = None,
62
- reverse_order: bool = False,
63
- show_progress: bool = False,
64
- debug: bool = False,
65
- logger: Optional[Logger] = None
46
+ self,
47
+ data_path: str,
48
+ filename: str,
49
+ description: str = "Update Planner",
50
+ fs: Optional[fsspec.AbstractFileSystem] = None,
51
+ filesystem_type: str = "file",
52
+ filesystem_options: Optional[Dict] = None,
53
+ reference_date: Union[str, datetime.date] = None,
54
+ history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
55
+ max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
56
+ overwrite: bool = False,
57
+ ignore_missing: bool = False,
58
+ custom_priority_map: Optional[Dict[str, int]] = None,
59
+ reverse_order: bool = False,
60
+ show_progress: bool = False,
61
+ verbose: bool = False,
62
+ debug: bool = False,
63
+ logger: Optional[Logger] = None,
64
+ skipped: Optional[List[str]] = None,
66
65
  ):
67
66
  # Initialize state
68
67
  self.plan: pd.DataFrame = pd.DataFrame()
@@ -77,7 +76,7 @@ class UpdatePlanner:
77
76
 
78
77
  # Filesystem and age helper
79
78
  self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))
80
- self.age_checker = FileAgeChecker(logger=self.logger)
79
+ self.age_checker = FileAgeChecker(debug=debug, logger=self.logger)
81
80
 
82
81
  # Normalize reference date
83
82
  if reference_date is None:
@@ -91,6 +90,7 @@ class UpdatePlanner:
91
90
  self.overwrite = overwrite
92
91
  self.ignore_missing = ignore_missing
93
92
  self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
93
+ self.skipped = skipped or []
94
94
 
95
95
  @staticmethod
96
96
  def _ensure_trailing_slash(path: str) -> str:
@@ -98,15 +98,14 @@ class UpdatePlanner:
98
98
  return path.rstrip('/') + '/'
99
99
 
100
100
  def _generate_plan(
101
- self,
102
- start: datetime.date,
103
- end: datetime.date,
104
- freq: str = "D"
101
+ self,
102
+ start: datetime.date,
103
+ end: datetime.date,
104
+ freq: str = "D"
105
105
  ) -> None:
106
106
  """
107
- Internal: populates self.plan and self.df_req with all dates and required subset.
107
+ Internal: populates self.plan with all dates, and self.df_req with only those needing update.
108
108
  """
109
- # Generate list of dates
110
109
  dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
111
110
  history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
112
111
  rows: List[Dict] = []
@@ -116,10 +115,11 @@ class UpdatePlanner:
116
115
  futures = {executor.submit(self._get_file_status, d): d for d in dates}
117
116
  iterator = as_completed(futures)
118
117
  if self.show_progress:
118
+ from tqdm import tqdm
119
119
  iterator = tqdm(
120
120
  iterator,
121
121
  total=len(futures),
122
- desc="Scanning dates",
122
+ desc=f"Scanning dates for {self.description}",
123
123
  unit="date",
124
124
  leave=False
125
125
  )
@@ -128,7 +128,6 @@ class UpdatePlanner:
128
128
  exists, age = future.result()
129
129
  rows.append(self._make_row(d, history_start, exists, age))
130
130
 
131
- # Build DataFrame and filtered subset
132
131
  df = pd.DataFrame(rows)
133
132
  df = df.sort_values(
134
133
  by=["update_priority", "date"],
@@ -139,44 +138,27 @@ class UpdatePlanner:
139
138
  self.df_req = df[df.update_required].copy()
140
139
 
141
140
  def generate_plan(
142
- self,
143
- start: Union[str, datetime.date],
144
- end: Union[str, datetime.date]
145
- ) -> List[Dict[str, Union[str, int]]]:
141
+ self,
142
+ start: Union[str, datetime.date],
143
+ end: Union[str, datetime.date]
144
+ ) -> pd.DataFrame:
146
145
  """
147
- Generate and return the update plan for dates between start and end.
148
-
149
- Returns:
150
- A list of dicts for dates requiring updates, each with:
151
- - date: str 'YYYY-MM-DD'
152
- - update_priority: int
153
- The list is sorted by update_priority ascending, then by date
154
- (descending if reverse_order=True).
155
-
156
- Raises:
157
- ValueError: if start > end.
146
+ Generate and return a DataFrame of dates requiring updates between start and end,
147
+ sorted by update_priority and date (descending if reverse_order=True).
158
148
  """
159
- # Normalize and validate inputs
160
149
  sd = pd.to_datetime(start).date()
161
150
  ed = pd.to_datetime(end).date()
162
151
  if sd > ed:
163
152
  raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
164
153
 
165
- if self.logger:
166
- self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
167
-
168
- # Populate plan
154
+ self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
169
155
  self._generate_plan(sd, ed)
156
+ self.logger.info(
157
+ f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
158
+ f"{len(self.df_req)} require update"
159
+ )
170
160
 
171
- if self.logger:
172
- self.logger.info(f"Plan built for {self.description}: {len(self.df_req)} dates require updates")
173
-
174
- # Format output
175
- output = [
176
- {"date": d.strftime("%Y-%m-%d"), "update_priority": int(p)}
177
- for d, p in zip(self.df_req.date, self.df_req.update_priority)
178
- ]
179
- return self.df_req[['date','update_priority','description']]
161
+ return self.df_req
180
162
 
181
163
  def show_update_plan(self) -> None:
182
164
  """
@@ -185,16 +167,21 @@ class UpdatePlanner:
185
167
  if self.plan.empty:
186
168
  self.logger.warning("No update plan available. Call generate_plan() first.")
187
169
  return
170
+ from IPython.display import display
188
171
  display(self.plan)
189
172
 
190
173
  def _get_file_status(
191
- self,
192
- date: datetime.date
174
+ self,
175
+ date: datetime.date
193
176
  ) -> Tuple[bool, Optional[float]]:
194
177
  """
195
178
  Check file existence and age for the given date.
196
179
  """
197
- path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/{self.filename}"
180
+ just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
181
+ if just_path in self.skipped:
182
+ self.logger.debug(f"Update plan is skipping date {date} as it is in the skipped list.")
183
+ return False, None
184
+ path = f"{just_path}{self.filename}"
198
185
  try:
199
186
  exists = self.fs.exists(path)
200
187
  age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
@@ -203,28 +190,39 @@ class UpdatePlanner:
203
190
  return False, None
204
191
 
205
192
  def _make_row(
206
- self,
207
- date: datetime.date,
208
- history_start: datetime.date,
209
- file_exists: bool,
210
- file_age: Optional[float]
193
+ self,
194
+ date: datetime.date,
195
+ history_start: datetime.date,
196
+ file_exists: bool,
197
+ file_age: Optional[float]
211
198
  ) -> Dict:
212
199
  """
213
200
  Build a single plan row based on flags and thresholds.
214
201
  """
215
- """Create a row for the update plan DataFrame"""
216
202
  within_history = history_start <= date <= self.reference_date
217
- category, update_required = "file_is_recent", False
203
+ update_required = False
218
204
 
205
+ # 1. Overwrite mode forces update
219
206
  if self.overwrite:
220
- category, update_required = "overwrite", True
207
+ category = "overwrite_forced"
208
+ update_required = True
209
+ # 2. Within history window: missing or stale
221
210
  elif within_history:
222
211
  if not file_exists:
223
- category, update_required = "missing_in_history", True
224
- elif file_age > self.max_age_minutes:
225
- category, update_required = "existing_but_stale", True
212
+ category = "missing_in_history"
213
+ update_required = True
214
+ elif file_age is not None and file_age > self.max_age_minutes:
215
+ category = "stale_in_history"
216
+ update_required = True
217
+ else:
218
+ category = "file_is_recent"
219
+ # 3. Outside history, missing file
226
220
  elif not file_exists and not self.ignore_missing:
227
- category, update_required = "missing_outside_history", True
221
+ category = "create_missing"
222
+ update_required = True
223
+ # 4. Everything else (existing files outside history, or ignored missing)
224
+ else:
225
+ category = "missing_ignored" if not file_exists else "file_is_recent"
228
226
 
229
227
  return {
230
228
  "date": date,
@@ -235,3 +233,16 @@ class UpdatePlanner:
235
233
  "update_required": update_required,
236
234
  "description": self.description,
237
235
  }
236
+
237
+ def exclude_dates(self, dates: Set[datetime.date]) -> None:
238
+ """
239
+ Exclude specific dates from the update plan.
240
+ """
241
+ if not isinstance(dates, set):
242
+ raise ValueError("dates must be a set of datetime.date objects.")
243
+ if self.plan.empty:
244
+ self.logger.warning("No update plan available. Call generate_plan() first.")
245
+ return
246
+ self.plan = self.plan[~self.plan['date'].isin(dates)]
247
+ self.df_req = self.plan[self.plan["update_required"]]
248
+ self.logger.info(f"Excluded {len(dates)} dates from the update plan.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.58
3
+ Version: 0.3.59
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,8 +1,8 @@
1
1
  sibi_dst/__init__.py,sha256=3pbriM7Ym5f9gew7n9cO4G_p9n-0bnxdmQ0hwBdJjr4,253
2
2
  sibi_dst/df_helper/__init__.py,sha256=McYrw2N0MsMgtawLrONXTGdyHfQWVOBUvIDbklfjb54,342
3
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=2iQ4bQLCPKnqfKMwmttJFLdNon0bc8Lz7UfDcVRG8pQ,10175
4
- sibi_dst/df_helper/_df_helper.py,sha256=WFqEyNhmE4hAXgMcqT8a7iYAhsmM3f_Z_DSQdlZu3P4,30968
5
- sibi_dst/df_helper/_parquet_artifact.py,sha256=i6QhQhGz6jyt7MWQk7CocQxUUZwG_50oSk9eyB42_kQ,11248
3
+ sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=-Y4i5KAxKY2BNkmoVeMEZxjTFD7zaM9oQ0aRsvUbQrs,9340
4
+ sibi_dst/df_helper/_df_helper.py,sha256=uKP6i-7dasZQ5zViD8-VJU0lNHumrdZG6IXvDFijZ18,31214
5
+ sibi_dst/df_helper/_parquet_artifact.py,sha256=6y8nJ-HDAdmy3XNSvnEdA2zBXDhUIVoUeKgXLmVMGCo,14879
6
6
  sibi_dst/df_helper/_parquet_reader.py,sha256=L6mr2FeKtTeIn37G9EGpvOx8PwMqXb6qnEECqBaiwxo,3954
7
7
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
@@ -16,7 +16,7 @@ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1u
16
16
  sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
17
17
  sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
18
18
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
19
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=PgMbjmEBSu5g5Vi0DL6Gd6871j_KuK5DNxKH9WDbWGE,7986
19
+ sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=JID-urZLbWjMd2dXt7onp6cPxAWQ3jnsY88s_lCscn8,7980
20
20
  sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
21
21
  sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=BtiRSYA4kFIM-mBCdrwE20vzByfq8_Biv_jPLUCDv58,5466
22
22
  sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ltqB5814PMecxwZgmsJL6nDhQf72V-w71YWFAf7aYZ8,6490
@@ -38,23 +38,24 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
38
38
  sibi_dst/osmnx_helper/utils.py,sha256=BzuY8CtYnBAAO8UAr_M7EOk6CP1zcifNLs8pkdFZEFg,20577
39
39
  sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
41
- sibi_dst/utils/__init__.py,sha256=RYVJ1f_bE3l-awgCpa-eV24HnBQC26hT4AFlwYghnPg,1120
41
+ sibi_dst/utils/__init__.py,sha256=I-LeSCEnnOKunblmCrv_KTBzbsr7Rx1MEWM9TypDqWM,1203
42
42
  sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
43
43
  sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcWxIQJ60,9877
44
44
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
45
45
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
46
46
  sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
47
- sibi_dst/utils/data_wrapper.py,sha256=npOVesxPBdvAUQVGhFc_3B9TgBQSZlKfNWOSm-FNdRQ,9779
48
- sibi_dst/utils/date_utils.py,sha256=OCJqkWl5e8fE7z11Ufz4206DUeuLMd_Gf_JGZu914Pg,18539
47
+ sibi_dst/utils/data_wrapper.py,sha256=DFkqi84DIGxcrf36FfbgmeF9Hu7PZjMO9otNerV8ZYk,10546
48
+ sibi_dst/utils/date_utils.py,sha256=T3ij-WOQu3cIfmNAweSVMWWr-hVtuBcTGjEY-cMJIvU,18627
49
49
  sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
50
50
  sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
51
51
  sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
52
- sibi_dst/utils/log_utils.py,sha256=eSAbi_jmMpJ8RpycakzT4S4zNkqVZDj3FY8WwnxpdXc,4623
52
+ sibi_dst/utils/log_utils.py,sha256=77xACRagKU83H9vn7aVeBzkQjxWlbe4dg4KuxPRCgvw,4635
53
+ sibi_dst/utils/manifest_manager.py,sha256=abm97TuWgJqNViPXMbpl5W7ttrg1BeiJkf2SMGc4hd8,5512
53
54
  sibi_dst/utils/parquet_saver.py,sha256=O62xwPfphOpKgEiHqnts20CPSU96pxs49Cg7PVetLK0,8193
54
55
  sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
55
56
  sibi_dst/utils/storage_config.py,sha256=Cg8EOGLZ_5v9sunaQHZLYHdp5FDkgPrCVVNHF-ys5sQ,2181
56
57
  sibi_dst/utils/storage_manager.py,sha256=btecX7ggNb7rfu5EK9Xuu2q_FZA7r_rB_tfhQ8V96qc,6567
57
- sibi_dst/utils/update_planner.py,sha256=AaprHgUsKeyZNvA3nRHrCqnxy8GXwySkaV27X_k54Xc,8799
58
+ sibi_dst/utils/update_planner.py,sha256=dJXLC-KdbWrCs-MFe7Xa8F-ZhlNJq8P1szjLAzMJZk0,9684
58
59
  sibi_dst/utils/webdav_client.py,sha256=pYF1UsGOuxYeGLq7aBfwZFvkvD4meOcbbaiZ4d6GW9I,7107
59
60
  sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
61
  sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
@@ -76,6 +77,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
76
77
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
77
78
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
78
79
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
79
- sibi_dst-0.3.58.dist-info/METADATA,sha256=DAQ2TfVwh0aB5uiZ-tnvlPzYe6hdB16M5NXswXZr9ZE,4292
80
- sibi_dst-0.3.58.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
81
- sibi_dst-0.3.58.dist-info/RECORD,,
80
+ sibi_dst-0.3.59.dist-info/METADATA,sha256=zYb_0a1ImTPUxB4raHM8cvip0_oZSot52N8okr3r_ZY,4292
81
+ sibi_dst-0.3.59.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
82
+ sibi_dst-0.3.59.dist-info/RECORD,,