sibi-dst 0.3.58__py3-none-any.whl → 0.3.59__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +211 -233
- sibi_dst/df_helper/_df_helper.py +7 -3
- sibi_dst/df_helper/_parquet_artifact.py +143 -52
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +3 -3
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/data_wrapper.py +149 -140
- sibi_dst/utils/date_utils.py +8 -8
- sibi_dst/utils/log_utils.py +1 -1
- sibi_dst/utils/manifest_manager.py +154 -0
- sibi_dst/utils/update_planner.py +96 -85
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/RECORD +13 -12
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/WHEEL +0 -0
sibi_dst/utils/update_planner.py
CHANGED
@@ -1,14 +1,9 @@
 import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Optional, Dict, Union, Tuple
-import logging
-from sibi_dst.utils import Logger
-
+from typing import List, Optional, Dict, Union, Tuple, Set
 import pandas as pd
 import fsspec
-from
-from tqdm import tqdm
-
+from sibi_dst.utils import Logger
 from .date_utils import FileAgeChecker
 
 
@@ -28,41 +23,45 @@ class UpdatePlanner:
         overwrite: If True, forces updates for all dates.
         ignore_missing: If True, skips missing files outside history.
         reverse_order: If True, sorts dates descending in output.
-        priority_map: Maps category names to numeric priorities.
         show_progress: If True, displays a tqdm progress bar.
         logger: Logger for informational messages.
 
     Note:
-        generate_plan() will overwrite self.plan and self.df_req.
+        generate_plan() will overwrite self.plan and self.df_req, and returns a DataFrame of required updates.
     """
 
     DEFAULT_PRIORITY_MAP = {
-        "
-        "
-        "
-        "
-        "
-        "
+        "file_is_recent": 0,
+        "missing_ignored": 0,
+        "overwrite_forced": 1,
+        "create_missing": 2,
+        "missing_in_history": 3,
+        "stale_in_history": 4,
     }
 
+    DEFAULT_MAX_AGE_MINUTES = 1440
+    DEFAULT_HISTORY_DAYS_THRESHOLD = 30
+
     def __init__(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            self,
+            data_path: str,
+            filename: str,
+            description: str = "Update Planner",
+            fs: Optional[fsspec.AbstractFileSystem] = None,
+            filesystem_type: str = "file",
+            filesystem_options: Optional[Dict] = None,
+            reference_date: Union[str, datetime.date] = None,
+            history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+            max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+            overwrite: bool = False,
+            ignore_missing: bool = False,
+            custom_priority_map: Optional[Dict[str, int]] = None,
+            reverse_order: bool = False,
+            show_progress: bool = False,
+            verbose: bool = False,
+            debug: bool = False,
+            logger: Optional[Logger] = None,
+            skipped: Optional[List[str]] = None,
     ):
         # Initialize state
         self.plan: pd.DataFrame = pd.DataFrame()
@@ -77,7 +76,7 @@ class UpdatePlanner:
 
         # Filesystem and age helper
         self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))
-        self.age_checker = FileAgeChecker(logger=self.logger)
+        self.age_checker = FileAgeChecker(debug=debug, logger=self.logger)
 
         # Normalize reference date
         if reference_date is None:
@@ -91,6 +90,7 @@ class UpdatePlanner:
         self.overwrite = overwrite
         self.ignore_missing = ignore_missing
         self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
+        self.skipped = skipped or []
 
     @staticmethod
    def _ensure_trailing_slash(path: str) -> str:
@@ -98,15 +98,14 @@ class UpdatePlanner:
         return path.rstrip('/') + '/'
 
     def _generate_plan(
-
-
-
-
+            self,
+            start: datetime.date,
+            end: datetime.date,
+            freq: str = "D"
     ) -> None:
         """
-        Internal: populates self.plan and self.df_req with
+        Internal: populates self.plan with all dates, and self.df_req with only those needing update.
         """
-        # Generate list of dates
         dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
         history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
         rows: List[Dict] = []
@@ -116,10 +115,11 @@ class UpdatePlanner:
             futures = {executor.submit(self._get_file_status, d): d for d in dates}
             iterator = as_completed(futures)
             if self.show_progress:
+                from tqdm import tqdm
                 iterator = tqdm(
                     iterator,
                     total=len(futures),
-                    desc="Scanning dates",
+                    desc=f"Scanning dates for {self.description}",
                     unit="date",
                     leave=False
                 )
@@ -128,7 +128,6 @@ class UpdatePlanner:
                 exists, age = future.result()
                 rows.append(self._make_row(d, history_start, exists, age))
 
-        # Build DataFrame and filtered subset
         df = pd.DataFrame(rows)
         df = df.sort_values(
             by=["update_priority", "date"],
@@ -139,44 +138,27 @@ class UpdatePlanner:
         self.df_req = df[df.update_required].copy()
 
     def generate_plan(
-
-
-
-    ) ->
+            self,
+            start: Union[str, datetime.date],
+            end: Union[str, datetime.date]
+    ) -> pd.DataFrame:
         """
-        Generate and return
-
-        Returns:
-            A list of dicts for dates requiring updates, each with:
-              - date: str 'YYYY-MM-DD'
-              - update_priority: int
-            The list is sorted by update_priority ascending, then by date
-            (descending if reverse_order=True).
-
-        Raises:
-            ValueError: if start > end.
+        Generate and return a DataFrame of dates requiring updates between start and end,
+        sorted by update_priority and date (descending if reverse_order=True).
         """
-        # Normalize and validate inputs
         sd = pd.to_datetime(start).date()
         ed = pd.to_datetime(end).date()
         if sd > ed:
             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
 
-
-        self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
-
-        # Populate plan
+        self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
         self._generate_plan(sd, ed)
+        self.logger.info(
+            f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
+            f"{len(self.df_req)} require update"
+        )
 
-
-        self.logger.info(f"Plan built for {self.description}: {len(self.df_req)} dates require updates")
-
-        # Format output
-        output = [
-            {"date": d.strftime("%Y-%m-%d"), "update_priority": int(p)}
-            for d, p in zip(self.df_req.date, self.df_req.update_priority)
-        ]
-        return self.df_req[['date','update_priority','description']]
+        return self.df_req
 
     def show_update_plan(self) -> None:
         """
@@ -185,16 +167,21 @@ class UpdatePlanner:
         if self.plan.empty:
             self.logger.warning("No update plan available. Call generate_plan() first.")
             return
+        from IPython.display import display
         display(self.plan)
 
     def _get_file_status(
-
-
+            self,
+            date: datetime.date
     ) -> Tuple[bool, Optional[float]]:
         """
         Check file existence and age for the given date.
         """
-
+        just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+        if just_path in self.skipped:
+            self.logger.debug(f"Update plan is skipping date {date} as it is in the skipped list.")
+            return False, None
+        path = f"{just_path}{self.filename}"
         try:
             exists = self.fs.exists(path)
             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
@@ -203,28 +190,39 @@ class UpdatePlanner:
             return False, None
 
     def _make_row(
-
-
-
-
-
+            self,
+            date: datetime.date,
+            history_start: datetime.date,
+            file_exists: bool,
+            file_age: Optional[float]
     ) -> Dict:
         """
         Build a single plan row based on flags and thresholds.
         """
-        """Create a row for the update plan DataFrame"""
         within_history = history_start <= date <= self.reference_date
-
+        update_required = False
 
+        # 1. Overwrite mode forces update
         if self.overwrite:
-            category
+            category = "overwrite_forced"
+            update_required = True
+        # 2. Within history window: missing or stale
         elif within_history:
             if not file_exists:
-                category
-
-
+                category = "missing_in_history"
+                update_required = True
+            elif file_age is not None and file_age > self.max_age_minutes:
+                category = "stale_in_history"
+                update_required = True
+            else:
+                category = "file_is_recent"
+        # 3. Outside history, missing file
         elif not file_exists and not self.ignore_missing:
-            category
+            category = "create_missing"
+            update_required = True
+        # 4. Everything else (existing files outside history, or ignored missing)
+        else:
+            category = "missing_ignored" if not file_exists else "file_is_recent"
 
         return {
             "date": date,
@@ -235,3 +233,16 @@ class UpdatePlanner:
             "update_required": update_required,
             "description": self.description,
         }
+
+    def exclude_dates(self, dates: Set[datetime.date]) -> None:
+        """
+        Exclude specific dates from the update plan.
+        """
+        if not isinstance(dates, set):
+            raise ValueError("dates must be a set of datetime.date objects.")
+        if self.plan.empty:
+            self.logger.warning("No update plan available. Call generate_plan() first.")
+            return
+        self.plan = self.plan[~self.plan['date'].isin(dates)]
+        self.df_req = self.plan[self.plan["update_required"]]
+        self.logger.info(f"Excluded {len(dates)} dates from the update plan.")
{sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 sibi_dst/__init__.py,sha256=3pbriM7Ym5f9gew7n9cO4G_p9n-0bnxdmQ0hwBdJjr4,253
 sibi_dst/df_helper/__init__.py,sha256=McYrw2N0MsMgtawLrONXTGdyHfQWVOBUvIDbklfjb54,342
-sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256
-sibi_dst/df_helper/_df_helper.py,sha256=
-sibi_dst/df_helper/_parquet_artifact.py,sha256=
+sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=-Y4i5KAxKY2BNkmoVeMEZxjTFD7zaM9oQ0aRsvUbQrs,9340
+sibi_dst/df_helper/_df_helper.py,sha256=uKP6i-7dasZQ5zViD8-VJU0lNHumrdZG6IXvDFijZ18,31214
+sibi_dst/df_helper/_parquet_artifact.py,sha256=6y8nJ-HDAdmy3XNSvnEdA2zBXDhUIVoUeKgXLmVMGCo,14879
 sibi_dst/df_helper/_parquet_reader.py,sha256=L6mr2FeKtTeIn37G9EGpvOx8PwMqXb6qnEECqBaiwxo,3954
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
@@ -16,7 +16,7 @@ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1u
 sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
 sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
-sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=
+sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=JID-urZLbWjMd2dXt7onp6cPxAWQ3jnsY88s_lCscn8,7980
 sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
 sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=BtiRSYA4kFIM-mBCdrwE20vzByfq8_Biv_jPLUCDv58,5466
 sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ltqB5814PMecxwZgmsJL6nDhQf72V-w71YWFAf7aYZ8,6490
@@ -38,23 +38,24 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
 sibi_dst/osmnx_helper/utils.py,sha256=BzuY8CtYnBAAO8UAr_M7EOk6CP1zcifNLs8pkdFZEFg,20577
 sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
-sibi_dst/utils/__init__.py,sha256=
+sibi_dst/utils/__init__.py,sha256=I-LeSCEnnOKunblmCrv_KTBzbsr7Rx1MEWM9TypDqWM,1203
 sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
 sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcWxIQJ60,9877
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
-sibi_dst/utils/data_wrapper.py,sha256=
-sibi_dst/utils/date_utils.py,sha256=
+sibi_dst/utils/data_wrapper.py,sha256=DFkqi84DIGxcrf36FfbgmeF9Hu7PZjMO9otNerV8ZYk,10546
+sibi_dst/utils/date_utils.py,sha256=T3ij-WOQu3cIfmNAweSVMWWr-hVtuBcTGjEY-cMJIvU,18627
 sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
 sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
 sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
-sibi_dst/utils/log_utils.py,sha256=
+sibi_dst/utils/log_utils.py,sha256=77xACRagKU83H9vn7aVeBzkQjxWlbe4dg4KuxPRCgvw,4635
+sibi_dst/utils/manifest_manager.py,sha256=abm97TuWgJqNViPXMbpl5W7ttrg1BeiJkf2SMGc4hd8,5512
 sibi_dst/utils/parquet_saver.py,sha256=O62xwPfphOpKgEiHqnts20CPSU96pxs49Cg7PVetLK0,8193
 sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
 sibi_dst/utils/storage_config.py,sha256=Cg8EOGLZ_5v9sunaQHZLYHdp5FDkgPrCVVNHF-ys5sQ,2181
 sibi_dst/utils/storage_manager.py,sha256=btecX7ggNb7rfu5EK9Xuu2q_FZA7r_rB_tfhQ8V96qc,6567
-sibi_dst/utils/update_planner.py,sha256=
+sibi_dst/utils/update_planner.py,sha256=dJXLC-KdbWrCs-MFe7Xa8F-ZhlNJq8P1szjLAzMJZk0,9684
 sibi_dst/utils/webdav_client.py,sha256=pYF1UsGOuxYeGLq7aBfwZFvkvD4meOcbbaiZ4d6GW9I,7107
 sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
@@ -76,6 +77,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.59.dist-info/METADATA,sha256=zYb_0a1ImTPUxB4raHM8cvip0_oZSot52N8okr3r_ZY,4292
+sibi_dst-0.3.59.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.59.dist-info/RECORD,,
{sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/WHEEL
File without changes