sibi-dst 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ import datetime
2
+ from concurrent.futures import ThreadPoolExecutor, as_completed
3
+ from typing import List, Optional, Dict, Union, Tuple
4
+ import logging
5
+ from sibi_dst.utils import Logger
6
+
7
+ import pandas as pd
8
+ import fsspec
9
+ from IPython.display import display
10
+ from tqdm import tqdm
11
+
12
+ from .date_utils import FileAgeChecker
13
+
14
+
15
class UpdatePlanner:
    """
    Scan a date-partitioned filesystem and build an update plan.

    For every date in a requested range the planner looks for
    ``<data_path><YYYY>/<MM>/<DD>/<filename>``, assigns the date an update
    category and priority, and records whether it needs processing.

    Attributes:
        plan: Full plan (one row per scanned date) from the last run.
        df_req: Subset of ``plan`` whose rows have ``update_required`` set.
        description: Free-text label copied into every plan row and log line.
        data_path: Base path (always ends with '/').
        filename: Filename inside each date folder.
        fs: fsspec filesystem instance.
        age_checker: FileAgeChecker for computing file ages.
        reference_date: The "today" date used for history windows.
        history_days_threshold: Number of days considered "in history".
        max_age_minutes: File staleness threshold in minutes.
        overwrite: If True, forces updates for all dates.
        ignore_missing: If True, skips missing files outside history.
        reverse_order: If True, sorts dates descending in output.
        priority_map: Maps category names to numeric priorities.
        show_progress: If True, displays a tqdm progress bar.
        logger: Logger for informational messages.

    Note:
        generate_plan() will overwrite self.plan and self.df_req.
    """

    # "overwrite" is the category _make_row actually emits.  The original
    # "overwrite forced" key never matched it (forced rows fell through to
    # priority 99); it is kept only so code reading this constant by the old
    # key keeps working.
    DEFAULT_PRIORITY_MAP = {
        "overwrite": 1,
        "overwrite forced": 1,
        "missing_in_history": 2,
        "existing_but_stale": 3,
        "missing_outside_history": 4,
        "missing_ignored": 0,
        "file_is_recent": 0,
    }

    # Column layout of a plan row; used to build an empty plan when the
    # requested range contains no dates.
    _PLAN_COLUMNS = (
        "date",
        "file_exists",
        "file_age_minutes",
        "update_category",
        "update_priority",
        "update_required",
        "description",
    )

    def __init__(
        self,
        data_path: str,
        filename: str,
        description: str = "Update Planner",
        fs: "Optional[fsspec.AbstractFileSystem]" = None,
        filesystem_type: str = "file",
        filesystem_options: Optional[Dict] = None,
        reference_date: Optional[Union[str, datetime.date]] = None,
        history_days_threshold: int = 30,
        max_age_minutes: int = 1440,
        overwrite: bool = False,
        ignore_missing: bool = False,
        custom_priority_map: Optional[Dict[str, int]] = None,
        reverse_order: bool = False,
        show_progress: bool = False,
        debug: bool = False,
        logger: "Optional[Logger]" = None,
    ):
        """
        Configure the planner; no filesystem access happens here.

        Args:
            data_path: Base path of the date-partitioned tree.
            filename: Name of the file expected in each date folder.
            description: Label used in log messages and plan rows.
            fs: Ready-made fsspec filesystem; when omitted one is created
                from ``filesystem_type`` and ``filesystem_options``.
            filesystem_type: fsspec protocol used when ``fs`` is omitted.
            filesystem_options: Keyword options for ``fsspec.filesystem``.
            reference_date: End of the history window; defaults to today.
                Anything ``pd.to_datetime`` accepts is allowed.
            history_days_threshold: Length of the history window in days.
            max_age_minutes: Age above which an existing file is stale.
            overwrite: Force an update for every date.
            ignore_missing: Do not request updates for missing files
                outside the history window.
            custom_priority_map: Replacement for DEFAULT_PRIORITY_MAP; an
                empty or missing map falls back to the default.
            reverse_order: Sort dates descending within each priority.
            show_progress: Wrap the scan in a tqdm progress bar.
            debug: Set the logger to DEBUG instead of INFO.
            logger: Logger to use; a default one is created when omitted.
        """
        # Initialize state
        self.plan: pd.DataFrame = pd.DataFrame()
        self.df_req: pd.DataFrame = pd.DataFrame()
        self.description = description
        self.data_path = self._ensure_trailing_slash(data_path)
        self.filename = filename
        self.reverse_order = reverse_order
        self.show_progress = show_progress
        self.logger = logger or Logger.default_logger(logger_name="update_planner")
        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)

        # Filesystem and age helper
        self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))
        self.age_checker = FileAgeChecker(logger=self.logger)

        # Normalize reference date
        if reference_date is None:
            self.reference_date = datetime.date.today()
        else:
            self.reference_date = pd.to_datetime(reference_date).date()

        # Thresholds and flags
        self.history_days_threshold = history_days_threshold
        self.max_age_minutes = max_age_minutes
        self.overwrite = overwrite
        self.ignore_missing = ignore_missing
        # Copy the default so one instance's changes cannot leak into the
        # class-level map shared by every other planner.
        self.priority_map = custom_priority_map or dict(self.DEFAULT_PRIORITY_MAP)

    @staticmethod
    def _ensure_trailing_slash(path: str) -> str:
        """Ensure that the provided path ends with a single '/'."""
        return path.rstrip('/') + '/'

    def _generate_plan(
        self,
        start: datetime.date,
        end: datetime.date,
        freq: str = "D"
    ) -> None:
        """
        Internal: populate self.plan and self.df_req for ``start``..``end``.

        File status checks run in a thread pool; results arrive in
        completion order and are put into a deterministic order by the
        final sort (priority ascending, then date).
        """
        # Generate list of dates
        dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
        history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)

        if not dates:
            # A coarse ``freq`` can yield no dates at all; sorting a
            # column-less frame would raise KeyError, so publish an empty
            # plan that still has the expected columns.
            self.plan = pd.DataFrame(columns=list(self._PLAN_COLUMNS))
            self.df_req = self.plan.copy()
            return

        rows: List[Dict] = []

        # Parallel file status checks
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(self._get_file_status, d): d for d in dates}
            iterator = as_completed(futures)
            if self.show_progress:
                iterator = tqdm(
                    iterator,
                    total=len(futures),
                    desc="Scanning dates",
                    unit="date",
                    leave=False
                )
            for future in iterator:
                d = futures[future]
                exists, age = future.result()
                rows.append(self._make_row(d, history_start, exists, age))

        # Build DataFrame and filtered subset
        df = pd.DataFrame(rows)
        df = df.sort_values(
            by=["update_priority", "date"],
            ascending=[True, not self.reverse_order]
        ).reset_index(drop=True)

        self.plan = df
        self.df_req = df[df.update_required].copy()

    def generate_plan(
        self,
        start: Union[str, datetime.date],
        end: Union[str, datetime.date]
    ) -> pd.DataFrame:
        """
        Generate and return the update plan for dates between start and end.

        Returns:
            A DataFrame restricted to the dates requiring updates, with the
            columns ``date``, ``update_priority`` and ``description``.
            Rows are sorted by update_priority ascending, then by date
            (descending if reverse_order=True).  The full plan remains
            available as ``self.plan``.

        Raises:
            ValueError: if start > end.
        """
        # Normalize and validate inputs
        sd = pd.to_datetime(start).date()
        ed = pd.to_datetime(end).date()
        if sd > ed:
            raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")

        if self.logger:
            self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")

        # Populate plan
        self._generate_plan(sd, ed)

        if self.logger:
            self.logger.info(f"Plan built for {self.description}: {len(self.df_req)} dates require updates")

        return self.df_req[['date', 'update_priority', 'description']]

    def show_update_plan(self) -> None:
        """
        Display the full update plan via IPython's ``display``.

        Logs a warning and returns when no plan has been generated yet.
        """
        if self.plan.empty:
            self.logger.warning("No update plan available. Call generate_plan() first.")
            return
        display(self.plan)

    def _get_file_status(
        self,
        date: datetime.date
    ) -> Tuple[bool, Optional[float]]:
        """
        Check file existence and age for the given date.

        Returns:
            ``(exists, age)`` where ``age`` is the value reported by the age
            checker, or None when the file does not exist.  Any error while
            querying the filesystem is logged and reported as
            ``(False, None)`` so a single bad path cannot abort the scan.
        """
        path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/{self.filename}"
        try:
            exists = self.fs.exists(path)
            age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
            return exists, age
        except Exception as exc:
            # Best effort: treat the file as missing, but leave a trace so
            # filesystem failures are distinguishable from absent files.
            self.logger.warning(f"Could not determine file status for {path}: {exc}")
            return False, None

    def _make_row(
        self,
        date: datetime.date,
        history_start: datetime.date,
        file_exists: bool,
        file_age: Optional[float]
    ) -> Dict:
        """
        Build a single plan row based on flags and thresholds.

        Category rules, in order of precedence:
            * ``overwrite`` set                       -> "overwrite"
            * in history window, file missing         -> "missing_in_history"
            * in history window, file stale           -> "existing_but_stale"
            * outside window, file missing            -> "missing_outside_history",
              or "missing_ignored" (no update) when ``ignore_missing`` is set
            * anything else                           -> "file_is_recent"

        The history window is ``history_start <= date <= reference_date``.
        Categories absent from the priority map get priority 99.
        """
        within_history = history_start <= date <= self.reference_date
        category, update_required = "file_is_recent", False

        if self.overwrite:
            category, update_required = "overwrite", True
        elif within_history:
            if not file_exists:
                category, update_required = "missing_in_history", True
            elif file_age is None or file_age > self.max_age_minutes:
                # An existing file whose age could not be determined is
                # refreshed rather than crashing on a None comparison.
                category, update_required = "existing_but_stale", True
        elif not file_exists:
            if self.ignore_missing:
                # Missing but deliberately skipped: label it accurately
                # instead of reporting a non-existent file as recent.
                category = "missing_ignored"
            else:
                category, update_required = "missing_outside_history", True

        return {
            "date": date,
            "file_exists": file_exists,
            "file_age_minutes": file_age,
            "update_category": category,
            "update_priority": self.priority_map.get(category, 99),
            "update_required": update_required,
            "description": self.description,
        }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.55
3
+ Version: 0.3.57
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,8 +1,8 @@
1
1
  sibi_dst/__init__.py,sha256=3pbriM7Ym5f9gew7n9cO4G_p9n-0bnxdmQ0hwBdJjr4,253
2
2
  sibi_dst/df_helper/__init__.py,sha256=McYrw2N0MsMgtawLrONXTGdyHfQWVOBUvIDbklfjb54,342
3
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=toH2QvNF-CQNJ4Bc8xreytuWr37G0EWz4ciWVdFMVqU,11646
4
- sibi_dst/df_helper/_df_helper.py,sha256=D85n4oUdu92IN2QaPc6k9uJJ_Vm197me1aoHojuWEYs,29833
5
- sibi_dst/df_helper/_parquet_artifact.py,sha256=dDoR5Tq0EJViW52tD9XW_vno7hkOfA8WeAilR1mAb_g,10636
3
+ sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=f0yz5L1WE2a_N5Kkjr06ZwyJktdXy9xBLMDWJXQAaOI,10145
4
+ sibi_dst/df_helper/_df_helper.py,sha256=Th5IakQjF22dfxDF5mC7DJOQm1ZAxM09AMW9uDH83oc,30962
5
+ sibi_dst/df_helper/_parquet_artifact.py,sha256=i6QhQhGz6jyt7MWQk7CocQxUUZwG_50oSk9eyB42_kQ,11248
6
6
  sibi_dst/df_helper/_parquet_reader.py,sha256=L6mr2FeKtTeIn37G9EGpvOx8PwMqXb6qnEECqBaiwxo,3954
7
7
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
@@ -16,10 +16,10 @@ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1u
16
16
  sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
17
17
  sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
18
18
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
19
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=LxuTh4U1rN-h921QO5Owck4SQcrABpcWUi4EjEBPf-c,2846
19
+ sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=PgMbjmEBSu5g5Vi0DL6Gd6871j_KuK5DNxKH9WDbWGE,7986
20
20
  sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
21
21
  sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=BtiRSYA4kFIM-mBCdrwE20vzByfq8_Biv_jPLUCDv58,5466
22
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=I2Us3RrxHci561yyZYBuUCrLVOhB0F3KBnae78m_ARw,6259
22
+ sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ltqB5814PMecxwZgmsJL6nDhQf72V-w71YWFAf7aYZ8,6490
23
23
  sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=ksvJ0EvktrVsoJ9DTMIQHzHe8ghw2mzDIBD_YgWytgw,8402
24
24
  sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
25
25
  sibi_dst/df_helper/core/_defaults.py,sha256=eNpHD2sZxir-2xO0b3_V16ryw8YP_5FfpIKK0HNuiN4,7011
@@ -38,22 +38,23 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
38
38
  sibi_dst/osmnx_helper/utils.py,sha256=BzuY8CtYnBAAO8UAr_M7EOk6CP1zcifNLs8pkdFZEFg,20577
39
39
  sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
41
- sibi_dst/utils/__init__.py,sha256=_4kuTzjCfbRF9927ywXqi-JKoHNec8wf05LYh4DMbPI,1077
41
+ sibi_dst/utils/__init__.py,sha256=RYVJ1f_bE3l-awgCpa-eV24HnBQC26hT4AFlwYghnPg,1120
42
42
  sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
43
43
  sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcWxIQJ60,9877
44
44
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
45
45
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
46
46
  sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
47
- sibi_dst/utils/data_wrapper.py,sha256=-RqK_sU3uuc6U9dFfnICFZkaIetk3JOvJain6_lTWlo,12446
47
+ sibi_dst/utils/data_wrapper.py,sha256=npOVesxPBdvAUQVGhFc_3B9TgBQSZlKfNWOSm-FNdRQ,9779
48
48
  sibi_dst/utils/date_utils.py,sha256=OCJqkWl5e8fE7z11Ufz4206DUeuLMd_Gf_JGZu914Pg,18539
49
49
  sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
50
50
  sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
51
51
  sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
52
52
  sibi_dst/utils/log_utils.py,sha256=eSAbi_jmMpJ8RpycakzT4S4zNkqVZDj3FY8WwnxpdXc,4623
53
- sibi_dst/utils/parquet_saver.py,sha256=EBtd9blzk7Wb65aDBVVU0ZMHFtqjfWi_fCUt1LvyAC4,8069
53
+ sibi_dst/utils/parquet_saver.py,sha256=O62xwPfphOpKgEiHqnts20CPSU96pxs49Cg7PVetLK0,8193
54
54
  sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
55
55
  sibi_dst/utils/storage_config.py,sha256=Cg8EOGLZ_5v9sunaQHZLYHdp5FDkgPrCVVNHF-ys5sQ,2181
56
56
  sibi_dst/utils/storage_manager.py,sha256=btecX7ggNb7rfu5EK9Xuu2q_FZA7r_rB_tfhQ8V96qc,6567
57
+ sibi_dst/utils/update_planner.py,sha256=AaprHgUsKeyZNvA3nRHrCqnxy8GXwySkaV27X_k54Xc,8799
57
58
  sibi_dst/utils/webdav_client.py,sha256=pYF1UsGOuxYeGLq7aBfwZFvkvD4meOcbbaiZ4d6GW9I,7107
58
59
  sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
60
  sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
@@ -75,6 +76,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
75
76
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
76
77
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
77
78
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
78
- sibi_dst-0.3.55.dist-info/METADATA,sha256=EHj1miIF5IHvHATok4rYdLhSo2kD8cSrAs9P1fER1_8,4292
79
- sibi_dst-0.3.55.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
80
- sibi_dst-0.3.55.dist-info/RECORD,,
79
+ sibi_dst-0.3.57.dist-info/METADATA,sha256=U6Ra-N2vtA8yJOdLmj1jNZ9skrrhlvQR3wauhkAbKgk,4292
80
+ sibi_dst-0.3.57.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
81
+ sibi_dst-0.3.57.dist-info/RECORD,,