sibi-dst 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +165 -166
- sibi_dst/df_helper/_df_helper.py +55 -23
- sibi_dst/df_helper/_parquet_artifact.py +29 -11
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +185 -57
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +6 -2
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/data_wrapper.py +33 -92
- sibi_dst/utils/parquet_saver.py +15 -12
- sibi_dst/utils/update_planner.py +237 -0
- {sibi_dst-0.3.55.dist-info → sibi_dst-0.3.57.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.55.dist-info → sibi_dst-0.3.57.dist-info}/RECORD +12 -11
- {sibi_dst-0.3.55.dist-info → sibi_dst-0.3.57.dist-info}/WHEEL +0 -0
@@ -0,0 +1,237 @@
|
|
1
|
+
import datetime
|
2
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
3
|
+
from typing import List, Optional, Dict, Union, Tuple
|
4
|
+
import logging
|
5
|
+
from sibi_dst.utils import Logger
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
import fsspec
|
9
|
+
from IPython.display import display
|
10
|
+
from tqdm import tqdm
|
11
|
+
|
12
|
+
from .date_utils import FileAgeChecker
|
13
|
+
|
14
|
+
|
15
|
+
class UpdatePlanner:
    """
    A utility class to scan a date-partitioned filesystem and
    generate an update plan indicating which dates need processing.

    Attributes:
        data_path: Base path (always ends with '/').
        filename: Filename inside each date folder.
        fs: fsspec filesystem instance.
        age_checker: FileAgeChecker for computing file ages.
        reference_date: The "today" date used for history windows (date or ISO string).
        history_days_threshold: Number of days considered "in history".
        max_age_minutes: File staleness threshold in minutes.
        overwrite: If True, forces updates for all dates.
        ignore_missing: If True, skips missing files outside history.
        reverse_order: If True, sorts dates descending in output.
        priority_map: Maps category names to numeric priorities.
        show_progress: If True, displays a tqdm progress bar.
        logger: Logger for informational messages.

    Note:
        generate_plan() will overwrite self.plan and self.df_req.
    """

    # Category labels MUST match the strings assigned in _make_row(); an
    # unknown category falls back to priority 99 (lowest urgency).
    DEFAULT_PRIORITY_MAP = {
        "overwrite forced": 1,
        "missing_in_history": 2,
        "existing_but_stale": 3,
        "missing_outside_history": 4,
        "missing_ignored": 0,
        "file_is_recent": 0,
    }

    def __init__(
        self,
        data_path: str,
        filename: str,
        description: str = "Update Planner",
        fs: Optional[fsspec.AbstractFileSystem] = None,
        filesystem_type: str = "file",
        filesystem_options: Optional[Dict] = None,
        reference_date: Union[str, datetime.date] = None,
        history_days_threshold: int = 30,
        max_age_minutes: int = 1440,
        overwrite: bool = False,
        ignore_missing: bool = False,
        custom_priority_map: Optional[Dict[str, int]] = None,
        reverse_order: bool = False,
        show_progress: bool = False,
        debug: bool = False,
        logger: Optional[Logger] = None
    ):
        # Initialize state; generate_plan() overwrites both frames.
        self.plan: pd.DataFrame = pd.DataFrame()
        self.df_req: pd.DataFrame = pd.DataFrame()
        self.description = description
        self.data_path = self._ensure_trailing_slash(data_path)
        self.filename = filename
        self.reverse_order = reverse_order
        self.show_progress = show_progress
        self.logger = logger or Logger.default_logger(logger_name="update_planner")
        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)

        # Filesystem and age helper
        self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))
        self.age_checker = FileAgeChecker(logger=self.logger)

        # Normalize reference date (accepts date objects or ISO strings)
        if reference_date is None:
            self.reference_date = datetime.date.today()
        else:
            self.reference_date = pd.to_datetime(reference_date).date()

        # Thresholds and flags
        self.history_days_threshold = history_days_threshold
        self.max_age_minutes = max_age_minutes
        self.overwrite = overwrite
        self.ignore_missing = ignore_missing
        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP

    @staticmethod
    def _ensure_trailing_slash(path: str) -> str:
        """Ensure that the provided path ends with a single '/'."""
        return path.rstrip('/') + '/'

    def _generate_plan(
        self,
        start: datetime.date,
        end: datetime.date,
        freq: str = "D"
    ) -> None:
        """
        Internal: populates self.plan and self.df_req with all dates and required subset.
        """
        # Generate list of dates
        dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
        history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
        rows: List[Dict] = []

        # Parallel file status checks (existence + age are I/O-bound)
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(self._get_file_status, d): d for d in dates}
            iterator = as_completed(futures)
            if self.show_progress:
                iterator = tqdm(
                    iterator,
                    total=len(futures),
                    desc="Scanning dates",
                    unit="date",
                    leave=False
                )
            for future in iterator:
                d = futures[future]
                exists, age = future.result()
                rows.append(self._make_row(d, history_start, exists, age))

        # Build DataFrame and filtered subset
        df = pd.DataFrame(rows)
        df = df.sort_values(
            by=["update_priority", "date"],
            ascending=[True, not self.reverse_order]
        ).reset_index(drop=True)

        self.plan = df
        self.df_req = df[df.update_required].copy()

    def generate_plan(
        self,
        start: Union[str, datetime.date],
        end: Union[str, datetime.date]
    ) -> pd.DataFrame:
        """
        Generate and return the update plan for dates between start and end.

        Returns:
            A DataFrame with columns ['date', 'update_priority', 'description']
            for the dates requiring updates, sorted by update_priority
            ascending, then by date (descending if reverse_order=True).

        Raises:
            ValueError: if start > end.
        """
        # Normalize and validate inputs
        sd = pd.to_datetime(start).date()
        ed = pd.to_datetime(end).date()
        if sd > ed:
            raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")

        if self.logger:
            self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")

        # Populate plan
        self._generate_plan(sd, ed)

        if self.logger:
            self.logger.info(f"Plan built for {self.description}: {len(self.df_req)} dates require updates")

        return self.df_req[['date', 'update_priority', 'description']]

    def show_update_plan(self) -> None:
        """
        Display the full update plan as a styled DataFrame.
        """
        if self.plan.empty:
            self.logger.warning("No update plan available. Call generate_plan() first.")
            return
        display(self.plan)

    def _get_file_status(
        self,
        date: datetime.date
    ) -> Tuple[bool, Optional[float]]:
        """
        Check file existence and age for the given date.

        Returns (exists, age_in_minutes); age is None when the file is
        missing or the check fails. Best-effort: errors are logged at
        debug level and reported as a missing file.
        """
        path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/{self.filename}"
        try:
            exists = self.fs.exists(path)
            age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
            return exists, age
        except Exception as e:
            # Don't abort the whole scan for one bad path, but leave a trace.
            self.logger.debug(f"Status check failed for {path}: {e}")
            return False, None

    def _make_row(
        self,
        date: datetime.date,
        history_start: datetime.date,
        file_exists: bool,
        file_age: Optional[float]
    ) -> Dict:
        """
        Build a single plan row based on flags and thresholds.
        """
        within_history = history_start <= date <= self.reference_date
        category, update_required = "file_is_recent", False

        if self.overwrite:
            # Must match the "overwrite forced" key in the priority map;
            # a bare "overwrite" label would fall back to priority 99.
            category, update_required = "overwrite forced", True
        elif within_history:
            if not file_exists:
                category, update_required = "missing_in_history", True
            elif file_age is None or file_age > self.max_age_minutes:
                # Unknown age is treated as stale rather than raising on
                # a None comparison.
                category, update_required = "existing_but_stale", True
        elif not file_exists:
            if not self.ignore_missing:
                category, update_required = "missing_outside_history", True
            else:
                # Explicitly label skipped files instead of "file_is_recent".
                category = "missing_ignored"

        return {
            "date": date,
            "file_exists": file_exists,
            "file_age_minutes": file_age,
            "update_category": category,
            "update_priority": self.priority_map.get(category, 99),
            "update_required": update_required,
            "description": self.description,
        }
|
@@ -1,8 +1,8 @@
|
|
1
1
|
sibi_dst/__init__.py,sha256=3pbriM7Ym5f9gew7n9cO4G_p9n-0bnxdmQ0hwBdJjr4,253
|
2
2
|
sibi_dst/df_helper/__init__.py,sha256=McYrw2N0MsMgtawLrONXTGdyHfQWVOBUvIDbklfjb54,342
|
3
|
-
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=
|
4
|
-
sibi_dst/df_helper/_df_helper.py,sha256=
|
5
|
-
sibi_dst/df_helper/_parquet_artifact.py,sha256=
|
3
|
+
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=f0yz5L1WE2a_N5Kkjr06ZwyJktdXy9xBLMDWJXQAaOI,10145
|
4
|
+
sibi_dst/df_helper/_df_helper.py,sha256=Th5IakQjF22dfxDF5mC7DJOQm1ZAxM09AMW9uDH83oc,30962
|
5
|
+
sibi_dst/df_helper/_parquet_artifact.py,sha256=i6QhQhGz6jyt7MWQk7CocQxUUZwG_50oSk9eyB42_kQ,11248
|
6
6
|
sibi_dst/df_helper/_parquet_reader.py,sha256=L6mr2FeKtTeIn37G9EGpvOx8PwMqXb6qnEECqBaiwxo,3954
|
7
7
|
sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
|
@@ -16,10 +16,10 @@ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1u
|
|
16
16
|
sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
|
17
17
|
sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
|
18
18
|
sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
|
19
|
-
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=
|
19
|
+
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=PgMbjmEBSu5g5Vi0DL6Gd6871j_KuK5DNxKH9WDbWGE,7986
|
20
20
|
sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
|
21
21
|
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=BtiRSYA4kFIM-mBCdrwE20vzByfq8_Biv_jPLUCDv58,5466
|
22
|
-
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=
|
22
|
+
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ltqB5814PMecxwZgmsJL6nDhQf72V-w71YWFAf7aYZ8,6490
|
23
23
|
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=ksvJ0EvktrVsoJ9DTMIQHzHe8ghw2mzDIBD_YgWytgw,8402
|
24
24
|
sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
|
25
25
|
sibi_dst/df_helper/core/_defaults.py,sha256=eNpHD2sZxir-2xO0b3_V16ryw8YP_5FfpIKK0HNuiN4,7011
|
@@ -38,22 +38,23 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
|
|
38
38
|
sibi_dst/osmnx_helper/utils.py,sha256=BzuY8CtYnBAAO8UAr_M7EOk6CP1zcifNLs8pkdFZEFg,20577
|
39
39
|
sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
40
|
sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
|
41
|
-
sibi_dst/utils/__init__.py,sha256=
|
41
|
+
sibi_dst/utils/__init__.py,sha256=RYVJ1f_bE3l-awgCpa-eV24HnBQC26hT4AFlwYghnPg,1120
|
42
42
|
sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
|
43
43
|
sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcWxIQJ60,9877
|
44
44
|
sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
|
45
45
|
sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
|
46
46
|
sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
|
47
|
-
sibi_dst/utils/data_wrapper.py,sha256
|
47
|
+
sibi_dst/utils/data_wrapper.py,sha256=npOVesxPBdvAUQVGhFc_3B9TgBQSZlKfNWOSm-FNdRQ,9779
|
48
48
|
sibi_dst/utils/date_utils.py,sha256=OCJqkWl5e8fE7z11Ufz4206DUeuLMd_Gf_JGZu914Pg,18539
|
49
49
|
sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
|
50
50
|
sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
|
51
51
|
sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
|
52
52
|
sibi_dst/utils/log_utils.py,sha256=eSAbi_jmMpJ8RpycakzT4S4zNkqVZDj3FY8WwnxpdXc,4623
|
53
|
-
sibi_dst/utils/parquet_saver.py,sha256=
|
53
|
+
sibi_dst/utils/parquet_saver.py,sha256=O62xwPfphOpKgEiHqnts20CPSU96pxs49Cg7PVetLK0,8193
|
54
54
|
sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
|
55
55
|
sibi_dst/utils/storage_config.py,sha256=Cg8EOGLZ_5v9sunaQHZLYHdp5FDkgPrCVVNHF-ys5sQ,2181
|
56
56
|
sibi_dst/utils/storage_manager.py,sha256=btecX7ggNb7rfu5EK9Xuu2q_FZA7r_rB_tfhQ8V96qc,6567
|
57
|
+
sibi_dst/utils/update_planner.py,sha256=AaprHgUsKeyZNvA3nRHrCqnxy8GXwySkaV27X_k54Xc,8799
|
57
58
|
sibi_dst/utils/webdav_client.py,sha256=pYF1UsGOuxYeGLq7aBfwZFvkvD4meOcbbaiZ4d6GW9I,7107
|
58
59
|
sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
59
60
|
sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
|
@@ -75,6 +76,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
|
|
75
76
|
sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
|
76
77
|
sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
|
77
78
|
sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
|
78
|
-
sibi_dst-0.3.
|
79
|
-
sibi_dst-0.3.
|
80
|
-
sibi_dst-0.3.
|
79
|
+
sibi_dst-0.3.57.dist-info/METADATA,sha256=U6Ra-N2vtA8yJOdLmj1jNZ9skrrhlvQR3wauhkAbKgk,4292
|
80
|
+
sibi_dst-0.3.57.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
81
|
+
sibi_dst-0.3.57.dist-info/RECORD,,
|
File without changes
|