sibi-dst 0.3.37__tar.gz → 0.3.39__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/PKG-INFO +1 -1
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/pyproject.toml +1 -1
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/data_wrapper.py +87 -96
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/date_utils.py +154 -1
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/README.md +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-0.3.37 → sibi_dst-0.3.39}/sibi_dst/utils/storage_manager.py +0 -0
sibi_dst/utils/data_wrapper.py

@@ -1,13 +1,13 @@
 import datetime
 from concurrent.futures import ThreadPoolExecutor
-from typing import Type, Any, Dict, Optional
-
+from typing import Type, Any, Dict, Optional, Union
+from threading import Lock
 import fsspec
 import pandas as pd
 from IPython.display import display
 from tqdm import tqdm

-from sibi_dst.utils import Logger
+from sibi_dst.utils import Logger, DateUtils
 from sibi_dst.utils import ParquetSaver


@@ -87,18 +87,18 @@ class DataWrapper:
                  reverse_order: bool = False,
                  overwrite: bool = False,
                  ignore_missing: bool = False,
-                 logger:
+                 logger: Logger = None,
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                  show_progress: bool = False,
-                 timeout:
+                 timeout: float = 300):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self.ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fs
+        self.fs = fs
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}
@@ -113,9 +113,16 @@ class DataWrapper:

         self.start_date = self.convert_to_date(start_date)
         self.end_date = self.convert_to_date(end_date)
+        self._lock = Lock()
+        self.processed_dates = []
+        self.date_utils = DateUtils(logger=self.logger)
+        if self.fs is None:
+            with self._lock:
+                if self.fs is None:
+                    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)

     @staticmethod
-    def convert_to_date(date:
+    def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
             return date
         try:
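
The added constructor lines above initialize the fsspec filesystem lazily with double-checked locking: the outer check skips the lock on the common, already-initialized path, and the re-check inside the lock keeps two threads from racing to build the filesystem. A minimal standalone sketch of the same pattern follows; the class and parameter names here are illustrative, not part of the package.

import fsspec
from threading import Lock


class LazyFilesystem:
    """Illustrative holder that defers fsspec filesystem creation."""

    def __init__(self, filesystem_type: str = "file", **filesystem_options):
        self.filesystem_type = filesystem_type
        self.filesystem_options = filesystem_options
        self.fs = None
        self._lock = Lock()

    def get(self) -> fsspec.AbstractFileSystem:
        # First check avoids taking the lock once initialized; the re-check
        # inside the lock prevents two threads from both creating the object.
        if self.fs is None:
            with self._lock:
                if self.fs is None:
                    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
        return self.fs


fs = LazyFilesystem().get()
print(fs.exists("."))  # True on the local filesystem
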
@@ -135,7 +142,7 @@ class DataWrapper:
         for date in date_range:
             yield date.date()

-    def process(self):
+    def process(self, max_retries: int = 3):
         """
         Processes update tasks by generating an update plan, filtering required updates, and distributing
         the workload across threads based on priority levels.
@@ -145,8 +152,8 @@ class DataWrapper:
         Each thread handles the updates for a specific priority level, ensuring a streamlined approach
         to handling the updates efficiently.

+        :param max_retries: Maximum number of retries for a task after a timeout. Defaults to 3.
         :raises TimeoutError: If a thread processing a priority level exceeds the allowed timeout duration.
-
         :return: None
         """
         update_plan_table = self.generate_update_plan_with_conditions()
@@ -156,12 +163,14 @@ class DataWrapper:
             display(update_plan_table)

         # Filter out rows that do not require updates (priority 0 means skip)
-
-
-
+        with self._lock:
+            update_plan_table = update_plan_table[
+                (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+            ]

         # Group by priority
-
+        with self._lock:
+            priorities = sorted(update_plan_table["update_priority"].unique())

         # We will process each priority level in its own thread.
         # Each thread will handle all dates associated with that priority.
@@ -174,7 +183,8 @@ class DataWrapper:
             # If show_progress is True, wrap in a progress bar
             date_iterator = dates_to_process
             if self.show_progress:
-                date_iterator = tqdm(date_iterator,
+                date_iterator = tqdm(date_iterator,
+                                     desc=f"Processing priority {priority}:{self.dataclass.__name__}",
                                      unit="date")

             # Process each date for this priority
|
|
184
194
|
# Launch a separate thread for each priority
|
185
195
|
with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
|
186
196
|
futures = {executor.submit(process_priority, p): p for p in priorities}
|
187
|
-
for
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
if "mtime" in info: # Local filesystem
|
213
|
-
file_modification_time = info["mtime"]
|
214
|
-
file_modification_datetime = datetime.datetime.fromtimestamp(
|
215
|
-
file_modification_time, tz=datetime.timezone.utc
|
216
|
-
)
|
217
|
-
elif "LastModified" in info: # S3-compatible filesystem
|
218
|
-
file_modification_datetime = (
|
219
|
-
info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
|
220
|
-
else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
221
|
-
)
|
222
|
-
self.logger.info(f"S3 File modification time: {file_modification_datetime}")
|
223
|
-
else:
|
224
|
-
self.logger.warning(f"Modification time not available for {file_path}.")
|
225
|
-
return True # Assume file is too old if we cannot determine its age
|
226
|
-
|
227
|
-
# Compare file age
|
228
|
-
current_time = datetime.datetime.now(datetime.timezone.utc)
|
229
|
-
file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
|
230
|
-
self.logger.info(
|
231
|
-
f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
|
232
|
-
f"(threshold: {self.max_age_minutes} minutes)"
|
233
|
-
)
|
234
|
-
return file_age_minutes > self.max_age_minutes
|
235
|
-
|
236
|
-
except FileNotFoundError:
|
237
|
-
self.logger.warning(f"File {file_path} not found.")
|
238
|
-
if self.ignore_missing:
|
239
|
-
return False
|
240
|
-
return True # File is considered old if it doesn't exist
|
241
|
-
except Exception as e:
|
242
|
-
self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
|
243
|
-
return True
|
197
|
+
retries = {p: 0 for p in priorities} # Track retry counts for each priority
|
198
|
+
|
199
|
+
while futures:
|
200
|
+
for future in list(futures.keys()):
|
201
|
+
try:
|
202
|
+
future.result(timeout=self.timeout)
|
203
|
+
del futures[future] # Remove completed future
|
204
|
+
except TimeoutError:
|
205
|
+
priority = futures[future]
|
206
|
+
retries[priority] += 1
|
207
|
+
|
208
|
+
if retries[priority] <= max_retries:
|
209
|
+
self.logger.warning(
|
210
|
+
f"Thread for priority {priority} timed out. Retrying ({retries[priority]}/{max_retries})..."
|
211
|
+
)
|
212
|
+
new_future = executor.submit(process_priority, priority)
|
213
|
+
futures[new_future] = priority
|
214
|
+
else:
|
215
|
+
self.logger.error(
|
216
|
+
f"Thread for priority {priority} timed out. Max retries ({max_retries}) exceeded. Skipping."
|
217
|
+
)
|
218
|
+
del futures[future] # Remove the timed-out future
|
219
|
+
except Exception as e:
|
220
|
+
self.logger.error(f"Error processing priority {futures[future]}: {e}")
|
221
|
+
del futures[future] # Remove the failed future
|
244
222
|
|
245
223
|
def process_date(self, date: datetime.date):
|
246
224
|
"""
|
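
The rewritten process() loop above polls each priority's future with a timeout and resubmits the work up to max_retries times before giving up. Below is a self-contained sketch of that retry-on-timeout pattern with a made-up worker and timings; note that resubmitting a timed-out task does not cancel the still-running original.

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError


def work(priority: int) -> None:
    # Stand-in for processing every date assigned to one priority level.
    time.sleep(priority * 0.1)


priorities = [1, 2, 3]
max_retries = 3
timeout = 1.0

with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
    futures = {executor.submit(work, p): p for p in priorities}
    retries = {p: 0 for p in priorities}

    while futures:
        for future in list(futures.keys()):
            priority = futures[future]
            try:
                future.result(timeout=timeout)
                del futures[future]  # Completed successfully
            except TimeoutError:
                retries[priority] += 1
                del futures[future]
                if retries[priority] <= max_retries:
                    # Resubmit the same priority and keep waiting on it.
                    futures[executor.submit(work, priority)] = priority
                else:
                    print(f"priority {priority}: gave up after {max_retries} retries")
            except Exception as exc:
                del futures[future]
                print(f"priority {priority} failed: {exc}")
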
@@ -261,6 +239,7 @@ class DataWrapper:
         full_parquet_filename = f"{folder}{self.parquet_filename}"

         start_time = datetime.datetime.now()
+        self.logger.info(f"Processing date: {date}")
         self.logger.info(f"Processing {full_parquet_filename}...")

         data_object = self.dataclass(**self.class_params)
@@ -270,14 +249,18 @@ class DataWrapper:
             self.logger.error("No data found for the specified date.")
             return

-
-
+        with self._lock:
+            parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
+            parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)

-
-
-
-
-
+        end_time = datetime.datetime.now()
+        duration_seconds = (end_time - start_time).total_seconds()
+        self.logger.info(
+            f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+        )
+
+        self.processed_dates.append(date)
+        self.logger.info(f"Finished processing date: {date}")

     def generate_update_plan_with_conditions(self):
         """
@@ -297,7 +280,12 @@ class DataWrapper:

         today = datetime.date.today()
         history_start_date = today - datetime.timedelta(days=self.history_days_threshold)
-
+        priority_map = {
+            "file is recent":0,
+            "overwrite": 1,
+            "history_days": 2,
+            "missing_files": 3
+        }
         date_range = self.generate_date_range()
         if self.show_progress:
             date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
@@ -317,12 +305,23 @@ class DataWrapper:
                 update_required = True
             # Hierarchy 2: History threshold evaluation
             elif within_history:
-                if
-
-
+                if file_exists:
+                    if self.date_utils.is_file_older_than(
+                        full_parquet_filename,
+                        max_age_minutes=self.max_age_minutes,
+                        fs=self.fs,
+                        ignore_missing=self.ignore_missing,
+                        verbose=self.verbose
+                    ):
+                        category = "history_days"
+                        update_required = True
+                    else:
+                        category = "file is recent"
+                        update_required = False
                 else:
-                    category = "
-                    update_required =
+                    category = "missing_files"
+                    update_required = True
+
             # Hierarchy 3: Missing files
             elif missing_file and current_date <= today:
                 category = "missing_files"
@@ -332,25 +331,17 @@ class DataWrapper:
                 update_required = False

             # Collect condition descriptions for the update plan table
-
+            row = {
                 "date": current_date,
                 "file_exists": file_exists,
                 "within_history": within_history,
                 "missing_file": missing_file,
                 "update_required": update_required,
                 "update_category": category,
-                "datawrapper class": self.dataclass.__name__
-
-            priority_map = {
-                "overwrite": 1,
-                "history_days": 2,
-                "missing_files": 3
+                "datawrapper class": self.dataclass.__name__,
+                "update_priority": priority_map.get(category, 0)
             }
-
-        for row in rows:
-            category = row.get("update_category")
-            # Default to None if no category assigned (no update required)
-            row["update_priority"] = priority_map.get(category, 0)
+            rows.append(row)

         update_plan_table = pd.DataFrame(rows)
         return update_plan_table
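
Taken together, these hunks replace the old post-hoc priority pass with a single priority_map consulted while each row is built: every evaluated date becomes one row tagged with a category, the category maps to a priority, priority-0 rows ("file is recent") are filtered out, and the survivors are grouped by priority for the threaded processing above. A small illustration of that flow, using fabricated sample rows:

import pandas as pd

# Same category-to-priority mapping as the new priority_map in the diff.
priority_map = {"file is recent": 0, "overwrite": 1, "history_days": 2, "missing_files": 3}

# Fabricated rows standing in for the evaluated dates.
rows = [
    {"date": "2025-01-01", "update_required": False, "update_category": "file is recent"},
    {"date": "2025-01-02", "update_required": True, "update_category": "history_days"},
    {"date": "2025-01-03", "update_required": True, "update_category": "missing_files"},
]
for row in rows:
    row["update_priority"] = priority_map.get(row["update_category"], 0)

plan = pd.DataFrame(rows)
# Priority 0 means "skip": the file is recent enough.
plan = plan[(plan["update_required"] == True) & (plan["update_priority"] != 0)]
for priority in sorted(plan["update_priority"].unique()):
    dates = plan.loc[plan["update_priority"] == priority, "date"].tolist()
    print(priority, dates)  # 2 ['2025-01-02'] then 3 ['2025-01-03']
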
sibi_dst/utils/date_utils.py

@@ -1,6 +1,7 @@
 import datetime
-from typing import Union, Tuple, Callable, Dict
+from typing import Union, Tuple, Callable, Dict, Optional

+import fsspec
 import numpy as np
 import pandas as pd

@@ -143,6 +144,158 @@ class DateUtils:
         'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
     }

+    def is_file_older_than(self, file_path: str, max_age_minutes: int, fs: Optional[fsspec.AbstractFileSystem] = None,
+                           ignore_missing: bool = False, verbose: bool = False) -> bool:
+        """
+        Check if a file or a partitioned Parquet dataset is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file or dataset.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object (e.g., S3, local). If not provided, defaults to the local filesystem.
+        :param ignore_missing: If True, treat missing files as not old. Defaults to False.
+        :param verbose: If True, log detailed information. Defaults to False.
+        :return: True if the file or dataset is older than max_age_minutes, False otherwise.
+        """
+        fs = fs or fsspec.filesystem("file")
+        self.logger.info(f"Checking age for {file_path}...")
+
+        try:
+            if not fs.exists(file_path):
+                self.logger.info(f"Path not found: {file_path}.")
+                return not ignore_missing
+
+            if fs.isdir(file_path):
+                self.logger.info(f"Found that {file_path} is a directory...")
+                return self._is_directory_older_than(file_path, max_age_minutes, fs, verbose)
+
+            elif fs.isfile(file_path):
+                return self._is_file_older_than(file_path, max_age_minutes, fs, verbose)
+
+            else:
+                self.logger.warning(f"Path {file_path} is neither a file nor a directory.")
+                return True
+
+        except Exception as e:
+            self.logger.warning(f"Error checking age for {file_path}: {str(e)}")
+            return True
+
+    def _is_directory_older_than(self, dir_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                                 verbose: bool) -> bool:
+        """
+        Check if the oldest file in a directory is older than the specified max_age_minutes.
+
+        :param dir_path: Path to the directory.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object.
+        :param verbose: If True, log detailed information.
+        :return: True if the oldest file is older than max_age_minutes, False otherwise.
+        """
+        all_files = fs.ls(dir_path)
+        if not all_files:
+            self.logger.info(f"No files found in dataset: {dir_path}.")
+            return True
+
+        modification_times = [
+            self._get_modification_time(fs.info(file), file)
+            for file in all_files
+            if self._is_valid_file(file, fs)
+        ]
+
+        if not modification_times:
+            self.logger.warning(f"No valid modification times found for dataset: {dir_path}. Assuming dataset is old.")
+            return True
+
+        oldest_modification_time = min(modification_times)
+        dataset_age_minutes = (datetime.datetime.now(
+            datetime.timezone.utc) - oldest_modification_time).total_seconds() / 60
+
+        if verbose:
+            self.logger.info(
+                f"Oldest file in dataset {dir_path} is {round(dataset_age_minutes, 2)} minutes old "
+                f"(threshold: {max_age_minutes} minutes)"
+            )
+
+        return dataset_age_minutes > max_age_minutes
+
+    def _is_file_older_than(self, file_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                            verbose: bool) -> bool:
+        """
+        Check if a single file is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object.
+        :param verbose: If True, log detailed information.
+        :return: True if the file is older than max_age_minutes, False otherwise.
+        """
+        info = fs.info(file_path)
+        if verbose:
+            self.logger.debug(f"File info for {file_path}: {info}")
+
+        file_modification_datetime = self._get_modification_time(info, file_path)
+        file_age_minutes = (datetime.datetime.now(
+            datetime.timezone.utc) - file_modification_datetime).total_seconds() / 60
+
+        if verbose:
+            self.logger.debug(
+                f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
+                f"(threshold: {max_age_minutes} minutes)"
+            )
+
+        return file_age_minutes > max_age_minutes
+
+    def _is_valid_file(self, file_path: str, fs: fsspec.AbstractFileSystem) -> bool:
+        """
+        Check if a file is valid (exists and has a valid modification time).
+
+        :param file_path: Path to the file.
+        :param fs: Filesystem object.
+        :return: True if the file is valid, False otherwise.
+        """
+        try:
+            fs.info(file_path)
+            return True
+        except Exception as e:
+            self.logger.warning(f"Error checking file age for {file_path}: {str(e)}")
+            return False
+
+    def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
+        """
+        Extract the modification time from file info.
+
+        :param info: File info dictionary.
+        :param file_path: Path to the file (for logging purposes).
+        :return: Modification time as a timezone-aware datetime object.
+        """
+        if "LastModified" in info:  # S3-compatible filesystem
+            last_modified = info["LastModified"]
+            if isinstance(last_modified, datetime.datetime):
+                return last_modified
+            else:
+                return datetime.datetime.strptime(last_modified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
+                    tzinfo=datetime.timezone.utc)
+
+        elif "mtime" in info:  # Local filesystem
+            return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
+
+        elif "modified" in info:  # FTP or SSH filesystem
+            modified_str = info["modified"]
+            try:
+                return datetime.datetime.strptime(modified_str, "%Y-%m-%d %H:%M:%S").replace(
+                    tzinfo=datetime.timezone.utc)
+            except ValueError:
+                try:
+                    return datetime.datetime.strptime(modified_str, "%b %d %H:%M").replace(
+                        year=datetime.datetime.now().year, tzinfo=datetime.timezone.utc
+                    )
+                except ValueError:
+                    self.logger.warning(f"Unsupported modification time format for {file_path}: {modified_str}")
+                    raise ValueError("Unsupported modification time format")
+
+        else:  # Fallback for unsupported filesystems
+            self.logger.warning(f"Modification time not available for {file_path}.")
+            raise ValueError("Modification time not available")
+

 class BusinessDays:
     """
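
The new DateUtils.is_file_older_than handles both a single file and a partitioned directory, reading whichever of LastModified, mtime, or modified the filesystem's info() exposes. A possible call is sketched below against the local filesystem; the path and threshold are placeholders, and a stand-in logger object is used because this diff does not show how the package's Logger is constructed.

import fsspec

# DateUtils is importable from sibi_dst.utils per the import change above.
from sibi_dst.utils import DateUtils


class PrintLogger:
    """Minimal stand-in exposing the logger methods the diff calls."""
    def info(self, msg): print("INFO:", msg)
    def warning(self, msg): print("WARN:", msg)
    def debug(self, msg): print("DEBUG:", msg)
    def error(self, msg): print("ERROR:", msg)


date_utils = DateUtils(logger=PrintLogger())
needs_refresh = date_utils.is_file_older_than(
    "/data/warehouse/2025-01-02/part.0.parquet",  # placeholder path
    max_age_minutes=1440,                         # treat files older than a day as stale
    fs=fsspec.filesystem("file"),
    ignore_missing=False,                         # a missing path counts as old
    verbose=True,
)
print(needs_refresh)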