sibi-dst 0.3.36__tar.gz → 0.3.38__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/PKG-INFO +1 -2
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/pyproject.toml +1 -2
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/data_wrapper.py +96 -108
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/date_utils.py +154 -1
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/README.md +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/storage_manager.py +0 -0
{sibi_dst-0.3.36 → sibi_dst-0.3.38}/PKG-INFO CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.36
+Version: 0.3.38
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -10,7 +10,6 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
-Requires-Dist: boto3 (>=1.36.3,<2.0.0)
 Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
```
{sibi_dst-0.3.36 → sibi_dst-0.3.38}/pyproject.toml CHANGED

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.36"
+version = "0.3.38"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -43,7 +43,6 @@ geopandas = "^1.0.1"
 osmnx = "^2.0.1"
 gunicorn = "^23.0.0"
 uvicorn-worker = "^0.3.0"
-boto3 = "^1.36.3"
 
 
 [build-system]
```
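With this release, `boto3` is dropped as a direct dependency (the `Requires-Dist` and `pyproject.toml` removals above). Storage access in the changed modules goes through `fsspec` filesystem objects instead, as the new `fs` parameters in the diffs below show. A minimal sketch of obtaining such a filesystem, assuming the optional `s3fs` backend is installed (bucket and key names are illustrative):

```python
import fsspec

# fsspec dispatches the "s3" protocol to s3fs when that package is installed;
# without it, only built-in protocols such as "file" are available.
fs = fsspec.filesystem("s3", anon=False)

# Any fsspec AbstractFileSystem can be handed to the code below as `fs`.
print(fs.exists("my-bucket/some/path/part.parquet"))
```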
{sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/data_wrapper.py CHANGED

```diff
@@ -1,13 +1,13 @@
 import datetime
 from concurrent.futures import ThreadPoolExecutor
-from typing import Type, Any, Dict, Optional
-
+from typing import Type, Any, Dict, Optional, Union
+from threading import Lock
 import fsspec
 import pandas as pd
 from IPython.display import display
 from tqdm import tqdm
 
-from sibi_dst.utils import Logger
+from sibi_dst.utils import Logger, DateUtils
 from sibi_dst.utils import ParquetSaver
 
 
@@ -87,11 +87,11 @@ class DataWrapper:
                  reverse_order: bool = False,
                  overwrite: bool = False,
                  ignore_missing: bool = False,
-                 logger:
+                 logger: Logger = None,
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                  show_progress: bool = False,
-                 timeout:
+                 timeout: float = 300):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self.ensure_forward_slash(data_path)
@@ -113,9 +113,12 @@ class DataWrapper:
 
         self.start_date = self.convert_to_date(start_date)
         self.end_date = self.convert_to_date(end_date)
+        self._lock = Lock()
+        self.processed_dates = []
+        self.date_utils = DateUtils(logger=self.logger)
 
     @staticmethod
-    def convert_to_date(date:
+    def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
             return date
         try:
```
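In the hunk above, `convert_to_date` gains an explicit signature: a `datetime.date` passes through unchanged, while a string is parsed (the parsing branch under `try:` is unchanged and not shown in this diff). A usage sketch, assuming ISO-formatted strings parse successfully:

```python
import datetime
from sibi_dst.utils.data_wrapper import DataWrapper

d1 = DataWrapper.convert_to_date(datetime.date(2024, 1, 5))  # date objects pass through
d2 = DataWrapper.convert_to_date("2024-01-05")               # assumed to parse to the same date
assert d1 == d2 == datetime.date(2024, 1, 5)
```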
```diff
@@ -135,7 +138,7 @@ class DataWrapper:
         for date in date_range:
             yield date.date()
 
-    def process(self):
+    def process(self, max_retries: int = 3):
         """
         Processes update tasks by generating an update plan, filtering required updates, and distributing
         the workload across threads based on priority levels.
@@ -145,8 +148,8 @@ class DataWrapper:
         Each thread handles the updates for a specific priority level, ensuring a streamlined approach
         to handling the updates efficiently.
 
+        :param max_retries: Maximum number of retries for a task after a timeout. Defaults to 3.
         :raises TimeoutError: If a thread processing a priority level exceeds the allowed timeout duration.
-
         :return: None
         """
         update_plan_table = self.generate_update_plan_with_conditions()
@@ -156,91 +159,63 @@ class DataWrapper:
         display(update_plan_table)
 
         # Filter out rows that do not require updates (priority 0 means skip)
+        with self._lock:
+            update_plan_table = update_plan_table[
+                (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+            ]
 
         # Group by priority
+        with self._lock:
+            priorities = sorted(update_plan_table["update_priority"].unique())
 
         # We will process each priority level in its own thread.
         # Each thread will handle all dates associated with that priority.
         def process_priority(priority):
             # Extract dates for the current priority
-            update_plan_table[
+            with self._lock:
+                dates_to_process = update_plan_table[
+                    update_plan_table["update_priority"] == priority
+                ]["date"].tolist()
+
+            # If show_progress is True, wrap in a progress bar
+            date_iterator = dates_to_process
+            if self.show_progress:
+                date_iterator = tqdm(date_iterator,
+                                     desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                                     unit="date")
+
+            # Process each date for this priority
+            for current_date in date_iterator:
+                self.process_date(current_date)
 
         # Launch a separate thread for each priority
         with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
             futures = {executor.submit(process_priority, p): p for p in priorities}
-            for
-            if "mtime" in info:  # Local filesystem
-                file_modification_time = info["mtime"]
-                file_modification_datetime = datetime.datetime.fromtimestamp(
-                    file_modification_time, tz=datetime.timezone.utc
-                )
-            elif "LastModified" in info:  # S3-compatible filesystem
-                file_modification_datetime = (
-                    info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
-                    else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
-                )
-                self.logger.info(f"S3 File modification time: {file_modification_datetime}")
-            else:
-                self.logger.warning(f"Modification time not available for {file_path}.")
-                return True  # Assume file is too old if we cannot determine its age
-
-            # Compare file age
-            current_time = datetime.datetime.now(datetime.timezone.utc)
-            file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
-            self.logger.info(
-                f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
-                f"(threshold: {self.max_age_minutes} minutes)"
-            )
-            return file_age_minutes > self.max_age_minutes
-
-        except FileNotFoundError:
-            self.logger.warning(f"File {file_path} not found.")
-            if self.ignore_missing:
-                return False
-            return True  # File is considered old if it doesn't exist
-        except Exception as e:
-            self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
-            return True
+            retries = {p: 0 for p in priorities}  # Track retry counts for each priority
+
+            while futures:
+                for future in list(futures.keys()):
+                    try:
+                        future.result(timeout=self.timeout)
+                        del futures[future]  # Remove completed future
+                    except TimeoutError:
+                        priority = futures[future]
+                        retries[priority] += 1
+
+                        if retries[priority] <= max_retries:
+                            self.logger.warning(
+                                f"Thread for priority {priority} timed out. Retrying ({retries[priority]}/{max_retries})..."
+                            )
+                            new_future = executor.submit(process_priority, priority)
+                            futures[new_future] = priority
+                        else:
+                            self.logger.error(
+                                f"Thread for priority {priority} timed out. Max retries ({max_retries}) exceeded. Skipping."
+                            )
+                            del futures[future]  # Remove the timed-out future
+                    except Exception as e:
+                        self.logger.error(f"Error processing priority {futures[future]}: {e}")
+                        del futures[future]  # Remove the failed future
 
     def process_date(self, date: datetime.date):
         """
```
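The rewritten `process()` above replaces the file-age logic that previously lived here (now moved to `DateUtils`, see the date_utils.py diff below) with a polling loop over per-priority futures: each `result()` call gets `self.timeout` seconds, a timed-out priority is resubmitted up to `max_retries` times, and failures are logged and dropped. A standalone sketch of the same pattern, with an illustrative toy `work()` task rather than package code (on Python 3.11+, which this package targets, `concurrent.futures.TimeoutError` is an alias of the builtin `TimeoutError`):

```python
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout

def work(priority: int) -> str:
    time.sleep(priority)  # stand-in for processing every date of one priority level
    return f"priority {priority} done"

priorities, timeout, max_retries = [1, 2, 6], 3.0, 2

with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
    futures = {executor.submit(work, p): p for p in priorities}
    retries = {p: 0 for p in priorities}

    while futures:
        for future in list(futures):  # snapshot keys; the dict is mutated below
            priority = futures[future]
            try:
                print(future.result(timeout=timeout))
                del futures[future]
            except FuturesTimeout:
                retries[priority] += 1
                del futures[future]  # this sketch drops the stalled future
                if retries[priority] <= max_retries:
                    # result(timeout=...) does not cancel a running thread, so the
                    # resubmitted attempt runs alongside the original one.
                    futures[executor.submit(work, priority)] = priority
                else:
                    print(f"priority {priority}: giving up after {max_retries} retries")
            except Exception as exc:
                print(f"priority {priority} failed: {exc}")
                del futures[future]
```

One difference worth noting: in the diff above, a timed-out future is left in the map while its retry is submitted, so a slow-but-alive attempt can still complete and be polled again on the next pass; the sketch drops it for simplicity.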
```diff
@@ -257,27 +232,31 @@ class DataWrapper:
         :type date: datetime.date
         :return: None
         """
+        with self._lock:
+            folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
+            full_parquet_filename = f"{folder}{self.parquet_filename}"
 
+        start_time = datetime.datetime.now()
+        self.logger.info(f"Processing date: {date}")
+        self.logger.info(f"Processing {full_parquet_filename}...")
 
+        data_object = self.dataclass(**self.class_params)
+        df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
+        if len(df.index) == 0:
+            self.logger.error("No data found for the specified date.")
+            return
 
+        parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
+        parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
 
+        end_time = datetime.datetime.now()
+        duration_seconds = (end_time - start_time).total_seconds()
+        self.logger.info(
+            f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+        )
+        self.processed_dates.append(date)
+        self.logger.info(f"Finished processing date: {date}")
 
     def generate_update_plan_with_conditions(self):
         """
```
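`process_date()` now derives the day folder and filename under the lock, loads a single day via `load_period`, and writes it with `ParquetSaver`, producing a `YYYY/MM/DD/` partition layout. The path construction in isolation (the base path is illustrative):

```python
import datetime

data_path = "s3://my-bucket/dataset/"  # illustrative fsspec-style base path
date = datetime.date(2024, 1, 5)

folder = f"{data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
print(folder)  # s3://my-bucket/dataset/2024/01/05/
```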
```diff
@@ -297,7 +276,12 @@ class DataWrapper:
 
         today = datetime.date.today()
         history_start_date = today - datetime.timedelta(days=self.history_days_threshold)
+        priority_map = {
+            "file is recent": 0,
+            "overwrite": 1,
+            "history_days": 2,
+            "missing_files": 3
+        }
         date_range = self.generate_date_range()
         if self.show_progress:
             date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
@@ -317,11 +301,17 @@ class DataWrapper:
                 update_required = True
             # Hierarchy 2: History threshold evaluation
             elif within_history:
-                if self.is_file_older_than(
+                if self.date_utils.is_file_older_than(
+                        full_parquet_filename,
+                        max_age_minutes=self.max_age_minutes,
+                        fs=self.fs,
+                        ignore_missing=self.ignore_missing,
+                        verbose=self.verbose
+                ):
                     category = "history_days"
                     update_required = True
                 else:
-                    category = "file
+                    category = "file is recent"
                     update_required = False
             # Hierarchy 3: Missing files
             elif missing_file and current_date <= today:
@@ -332,20 +322,18 @@ class DataWrapper:
                 update_required = False
 
             # Collect condition descriptions for the update plan table
+            row = {
                 "date": current_date,
                 "file_exists": file_exists,
                 "within_history": within_history,
                 "missing_file": missing_file,
                 "update_required": update_required,
                 "update_category": category,
-                "datawrapper class": self.dataclass.__name__
-
-            priority_map = {
-                "overwrite": 1,
-                "history_days": 2,
-                "missing_files": 3
+                "datawrapper class": self.dataclass.__name__,
+                "update_priority": priority_map.get(category, 0)
             }
+            rows.append(row)
+
 
         for row in rows:
             category = row.get("update_category")
```
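With `update_priority` computed per row via `priority_map.get(category, 0)`, the plan table now carries everything `process()` needs: it filters to rows with `update_required == True` and a non-zero priority, then groups the remaining dates by priority. A toy plan table showing that filter (data is illustrative):

```python
import datetime
import pandas as pd

plan = pd.DataFrame({
    "date": [datetime.date(2024, 1, d) for d in (1, 2, 3, 4)],
    "update_required": [True, False, True, True],
    "update_priority": [1, 0, 2, 2],  # 0 means "file is recent", i.e. skip
})

# The same filter process() applies before scheduling threads.
plan = plan[(plan["update_required"] == True) & (plan["update_priority"] != 0)]

for priority in sorted(plan["update_priority"].unique()):
    dates = plan[plan["update_priority"] == priority]["date"].tolist()
    print(priority, dates)  # 1 -> [2024-01-01], 2 -> [2024-01-03, 2024-01-04]
```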
{sibi_dst-0.3.36 → sibi_dst-0.3.38}/sibi_dst/utils/date_utils.py CHANGED

```diff
@@ -1,6 +1,7 @@
 import datetime
-from typing import Union, Tuple, Callable, Dict
+from typing import Union, Tuple, Callable, Dict, Optional
 
+import fsspec
 import numpy as np
 import pandas as pd
 
@@ -143,6 +144,158 @@ class DateUtils:
         'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
     }
 
+    def is_file_older_than(self, file_path: str, max_age_minutes: int, fs: Optional[fsspec.AbstractFileSystem] = None,
+                           ignore_missing: bool = False, verbose: bool = False) -> bool:
+        """
+        Check if a file or a partitioned Parquet dataset is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file or dataset.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object (e.g., S3, local). If not provided, defaults to the local filesystem.
+        :param ignore_missing: If True, treat missing files as not old. Defaults to False.
+        :param verbose: If True, log detailed information. Defaults to False.
+        :return: True if the file or dataset is older than max_age_minutes, False otherwise.
+        """
+        fs = fs or fsspec.filesystem("file")
+        self.logger.info(f"Checking age for {file_path}...")
+
+        try:
+            if not fs.exists(file_path):
+                self.logger.info(f"Path not found: {file_path}.")
+                return not ignore_missing
+
+            if fs.isdir(file_path):
+                self.logger.info(f"Found that {file_path} is a directory...")
+                return self._is_directory_older_than(file_path, max_age_minutes, fs, verbose)
+
+            elif fs.isfile(file_path):
+                return self._is_file_older_than(file_path, max_age_minutes, fs, verbose)
+
+            else:
+                self.logger.warning(f"Path {file_path} is neither a file nor a directory.")
+                return True
+
+        except Exception as e:
+            self.logger.warning(f"Error checking age for {file_path}: {str(e)}")
+            return True
+
+    def _is_directory_older_than(self, dir_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                                 verbose: bool) -> bool:
+        """
+        Check if the oldest file in a directory is older than the specified max_age_minutes.
+
+        :param dir_path: Path to the directory.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object.
+        :param verbose: If True, log detailed information.
+        :return: True if the oldest file is older than max_age_minutes, False otherwise.
+        """
+        all_files = fs.ls(dir_path)
+        if not all_files:
+            self.logger.info(f"No files found in dataset: {dir_path}.")
+            return True
+
+        modification_times = [
+            self._get_modification_time(fs.info(file), file)
+            for file in all_files
+            if self._is_valid_file(file, fs)
+        ]
+
+        if not modification_times:
+            self.logger.warning(f"No valid modification times found for dataset: {dir_path}. Assuming dataset is old.")
+            return True
+
+        oldest_modification_time = min(modification_times)
+        dataset_age_minutes = (datetime.datetime.now(
+            datetime.timezone.utc) - oldest_modification_time).total_seconds() / 60
+
+        if verbose:
+            self.logger.info(
+                f"Oldest file in dataset {dir_path} is {round(dataset_age_minutes, 2)} minutes old "
+                f"(threshold: {max_age_minutes} minutes)"
+            )
+
+        return dataset_age_minutes > max_age_minutes
+
+    def _is_file_older_than(self, file_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                            verbose: bool) -> bool:
+        """
+        Check if a single file is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object.
+        :param verbose: If True, log detailed information.
+        :return: True if the file is older than max_age_minutes, False otherwise.
+        """
+        info = fs.info(file_path)
+        if verbose:
+            self.logger.debug(f"File info for {file_path}: {info}")
+
+        file_modification_datetime = self._get_modification_time(info, file_path)
+        file_age_minutes = (datetime.datetime.now(
+            datetime.timezone.utc) - file_modification_datetime).total_seconds() / 60
+
+        if verbose:
+            self.logger.debug(
+                f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
+                f"(threshold: {max_age_minutes} minutes)"
+            )
+
+        return file_age_minutes > max_age_minutes
+
+    def _is_valid_file(self, file_path: str, fs: fsspec.AbstractFileSystem) -> bool:
+        """
+        Check if a file is valid (exists and has a valid modification time).
+
+        :param file_path: Path to the file.
+        :param fs: Filesystem object.
+        :return: True if the file is valid, False otherwise.
+        """
+        try:
+            fs.info(file_path)
+            return True
+        except Exception as e:
+            self.logger.warning(f"Error checking file age for {file_path}: {str(e)}")
+            return False
+
+    def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
+        """
+        Extract the modification time from file info.
+
+        :param info: File info dictionary.
+        :param file_path: Path to the file (for logging purposes).
+        :return: Modification time as a timezone-aware datetime object.
+        """
+        if "LastModified" in info:  # S3-compatible filesystem
+            last_modified = info["LastModified"]
+            if isinstance(last_modified, datetime.datetime):
+                return last_modified
+            else:
+                return datetime.datetime.strptime(last_modified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
+                    tzinfo=datetime.timezone.utc)
+
+        elif "mtime" in info:  # Local filesystem
+            return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
+
+        elif "modified" in info:  # FTP or SSH filesystem
+            modified_str = info["modified"]
+            try:
+                return datetime.datetime.strptime(modified_str, "%Y-%m-%d %H:%M:%S").replace(
+                    tzinfo=datetime.timezone.utc)
+            except ValueError:
+                try:
+                    return datetime.datetime.strptime(modified_str, "%b %d %H:%M").replace(
+                        year=datetime.datetime.now().year, tzinfo=datetime.timezone.utc
+                    )
+                except ValueError:
+                    self.logger.warning(f"Unsupported modification time format for {file_path}: {modified_str}")
+                    raise ValueError("Unsupported modification time format")
+
+        else:  # Fallback for unsupported filesystems
+            self.logger.warning(f"Modification time not available for {file_path}.")
+            raise ValueError("Modification time not available")
+
 
 class BusinessDays:
     """
```
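The file-age check removed from `DataWrapper` is now a reusable `DateUtils` method, with the per-backend differences isolated in `_get_modification_time()`: s3fs-style filesystems report `LastModified`, the local filesystem reports `mtime`, and FTP/SSH backends report `modified`. A usage sketch; the logger wiring is an assumption (any object with `.info`/`.warning`/`.debug` should do, given how the methods above use it), and the path is illustrative:

```python
import logging

import fsspec
from sibi_dst.utils import DateUtils

my_logger = logging.getLogger("dateutils-demo")  # assumed: the constructor stores this as self.logger
du = DateUtils(logger=my_logger)

fs = fsspec.filesystem("file")  # the local backend exposes "mtime" in fs.info()

stale = du.is_file_older_than(
    "data/2024/01/05/part.parquet",  # a file, or a partitioned dataset directory
    max_age_minutes=30,
    fs=fs,
    ignore_missing=True,  # a missing path is treated as fresh (returns False)
    verbose=True,
)
print("needs refresh:", stale)
```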