sibi-dst 0.3.37__py3-none-any.whl → 0.3.38__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
sibi_dst/utils/data_wrapper.py

@@ -1,13 +1,13 @@
 import datetime
 from concurrent.futures import ThreadPoolExecutor
-from typing import Type, Any, Dict, Optional
-
+from typing import Type, Any, Dict, Optional, Union
+from threading import Lock
 import fsspec
 import pandas as pd
 from IPython.display import display
 from tqdm import tqdm
 
-from sibi_dst.utils import Logger
+from sibi_dst.utils import Logger, DateUtils
 from sibi_dst.utils import ParquetSaver
 
 
@@ -87,11 +87,11 @@ class DataWrapper:
                  reverse_order: bool = False,
                  overwrite: bool = False,
                  ignore_missing: bool = False,
-                 logger: Optional[Logger] = None,
+                 logger: Logger = None,
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                  show_progress: bool = False,
-                 timeout: Optional[int] = 300):
+                 timeout: float = 300):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self.ensure_forward_slash(data_path)
@@ -113,9 +113,12 @@ class DataWrapper:
 
         self.start_date = self.convert_to_date(start_date)
         self.end_date = self.convert_to_date(end_date)
+        self._lock = Lock()
+        self.processed_dates = []
+        self.date_utils = DateUtils(logger=self.logger)
 
     @staticmethod
-    def convert_to_date(date: Any) -> datetime.date:
+    def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
             return date
         try:
@@ -135,7 +138,7 @@ class DataWrapper:
         for date in date_range:
             yield date.date()
 
-    def process(self):
+    def process(self, max_retries: int = 3):
         """
         Processes update tasks by generating an update plan, filtering required updates, and distributing
         the workload across threads based on priority levels.
@@ -145,8 +148,8 @@ class DataWrapper:
         Each thread handles the updates for a specific priority level, ensuring a streamlined approach
         to handling the updates efficiently.
 
+        :param max_retries: Maximum number of retries for a task after a timeout. Defaults to 3.
         :raises TimeoutError: If a thread processing a priority level exceeds the allowed timeout duration.
-
         :return: None
         """
         update_plan_table = self.generate_update_plan_with_conditions()
@@ -156,91 +159,63 @@ class DataWrapper:
             display(update_plan_table)
 
         # Filter out rows that do not require updates (priority 0 means skip)
-        update_plan_table = update_plan_table[
-            (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
-        ]
+        with self._lock:
+            update_plan_table = update_plan_table[
+                (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+            ]
 
         # Group by priority
-        priorities = sorted(update_plan_table["update_priority"].unique())
+        with self._lock:
+            priorities = sorted(update_plan_table["update_priority"].unique())
 
         # We will process each priority level in its own thread.
         # Each thread will handle all dates associated with that priority.
         def process_priority(priority):
             # Extract dates for the current priority
-            dates_to_process = update_plan_table[
-                update_plan_table["update_priority"] == priority
-            ]["date"].tolist()
-
-            # If show_progress is True, wrap in a progress bar
-            date_iterator = dates_to_process
-            if self.show_progress:
-                date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
-                                     unit="date")
-
-            # Process each date for this priority
-            for current_date in date_iterator:
-                self.process_date(current_date)
+            with self._lock:
+                dates_to_process = update_plan_table[
+                    update_plan_table["update_priority"] == priority
+                ]["date"].tolist()
+
+                # If show_progress is True, wrap in a progress bar
+                date_iterator = dates_to_process
+                if self.show_progress:
+                    date_iterator = tqdm(date_iterator,
+                                         desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                                         unit="date")
+
+                # Process each date for this priority
+                for current_date in date_iterator:
+                    self.process_date(current_date)
 
         # Launch a separate thread for each priority
         with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
             futures = {executor.submit(process_priority, p): p for p in priorities}
-            for future in futures:
-                try:
-                    future.result(timeout=self.timeout)
-                except TimeoutError:
-                    self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
-                    future.cancel()
-                    priority = futures[future]
-                    new_future = executor.submit(process_priority, priority)
-                    futures[new_future] = priority
-                    self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
-
-    def is_file_older_than(self, file_path: str) -> bool:
-        """
-        Check if a file is older than the specified max_age_minutes.
-
-        :param file_path: Path to the file.
-        :return: True if the file is older than max_age_minutes, False otherwise.
-        """
-        try:
-            # Get file info
-            info = self.fs.info(file_path)
-            self.logger.info(f"File info for {file_path}: {info}")
-
-            # Determine the modification time from available keys
-            file_modification_time = None
-            if "mtime" in info:  # Local filesystem
-                file_modification_time = info["mtime"]
-                file_modification_datetime = datetime.datetime.fromtimestamp(
-                    file_modification_time, tz=datetime.timezone.utc
-                )
-            elif "LastModified" in info:  # S3-compatible filesystem
-                file_modification_datetime = (
-                    info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
-                    else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
-                )
-                self.logger.info(f"S3 File modification time: {file_modification_datetime}")
-            else:
-                self.logger.warning(f"Modification time not available for {file_path}.")
-                return True  # Assume file is too old if we cannot determine its age
-
-            # Compare file age
-            current_time = datetime.datetime.now(datetime.timezone.utc)
-            file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
-            self.logger.info(
-                f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
-                f"(threshold: {self.max_age_minutes} minutes)"
-            )
-            return file_age_minutes > self.max_age_minutes
-
-        except FileNotFoundError:
-            self.logger.warning(f"File {file_path} not found.")
-            if self.ignore_missing:
-                return False
-            return True  # File is considered old if it doesn't exist
-        except Exception as e:
-            self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
-            return True
+            retries = {p: 0 for p in priorities}  # Track retry counts for each priority
+
+            while futures:
+                for future in list(futures.keys()):
+                    try:
+                        future.result(timeout=self.timeout)
+                        del futures[future]  # Remove completed future
+                    except TimeoutError:
+                        priority = futures[future]
+                        retries[priority] += 1
+
+                        if retries[priority] <= max_retries:
+                            self.logger.warning(
+                                f"Thread for priority {priority} timed out. Retrying ({retries[priority]}/{max_retries})..."
+                            )
+                            new_future = executor.submit(process_priority, priority)
+                            futures[new_future] = priority
+                        else:
+                            self.logger.error(
+                                f"Thread for priority {priority} timed out. Max retries ({max_retries}) exceeded. Skipping."
+                            )
+                            del futures[future]  # Remove the timed-out future
+                    except Exception as e:
+                        self.logger.error(f"Error processing priority {futures[future]}: {e}")
+                        del futures[future]  # Remove the failed future
 
     def process_date(self, date: datetime.date):
         """
@@ -257,27 +232,31 @@ class DataWrapper:
         :type date: datetime.date
         :return: None
         """
-        folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
-        full_parquet_filename = f"{folder}{self.parquet_filename}"
+        with self._lock:
+            folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
+            full_parquet_filename = f"{folder}{self.parquet_filename}"
 
-        start_time = datetime.datetime.now()
-        self.logger.info(f"Processing {full_parquet_filename}...")
+            start_time = datetime.datetime.now()
+            self.logger.info(f"Processing date: {date}")
+            self.logger.info(f"Processing {full_parquet_filename}...")
 
-        data_object = self.dataclass(**self.class_params)
-        df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
+            data_object = self.dataclass(**self.class_params)
+            df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
-        if len(df.index) == 0:
-            self.logger.error("No data found for the specified date.")
-            return
+            if len(df.index) == 0:
+                self.logger.error("No data found for the specified date.")
+                return
 
-        parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
-        parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
+            parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
+            parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
 
-        end_time = datetime.datetime.now()
-        duration_seconds = (end_time - start_time).total_seconds()
-        self.logger.info(
-            f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
-        )
+            end_time = datetime.datetime.now()
+            duration_seconds = (end_time - start_time).total_seconds()
+            self.logger.info(
+                f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+            )
+            self.processed_dates.append(date)
+            self.logger.info(f"Finished processing date: {date}")
 
     def generate_update_plan_with_conditions(self):
         """
@@ -297,7 +276,12 @@ class DataWrapper:
 
         today = datetime.date.today()
         history_start_date = today - datetime.timedelta(days=self.history_days_threshold)
-
+        priority_map = {
+            "file is recent": 0,
+            "overwrite": 1,
+            "history_days": 2,
+            "missing_files": 3
+        }
         date_range = self.generate_date_range()
         if self.show_progress:
             date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
@@ -317,11 +301,17 @@ class DataWrapper:
                 update_required = True
             # Hierarchy 2: History threshold evaluation
             elif within_history:
-                if self.is_file_older_than(full_parquet_filename):
+                if self.date_utils.is_file_older_than(
+                        full_parquet_filename,
+                        max_age_minutes=self.max_age_minutes,
+                        fs=self.fs,
+                        ignore_missing=self.ignore_missing,
+                        verbose=self.verbose
+                ):
                     category = "history_days"
                     update_required = True
                 else:
-                    category = "file age is recent"
+                    category = "file is recent"
                     update_required = False
             # Hierarchy 3: Missing files
             elif missing_file and current_date <= today:
@@ -332,20 +322,18 @@ class DataWrapper:
                 update_required = False
 
             # Collect condition descriptions for the update plan table
-            rows.append({
+            row = {
                 "date": current_date,
                 "file_exists": file_exists,
                 "within_history": within_history,
                 "missing_file": missing_file,
                 "update_required": update_required,
                 "update_category": category,
-                "datawrapper class": self.dataclass.__name__
-            })
-            priority_map = {
-                "overwrite": 1,
-                "history_days": 2,
-                "missing_files": 3
+                "datawrapper class": self.dataclass.__name__,
+                "update_priority": priority_map.get(category, 0)
             }
+            rows.append(row)
+
 
         for row in rows:
            category = row.get("update_category")
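
Note on the `process()` change above: the single-pass `for future in futures:` loop is replaced by a polling loop that re-submits a priority level after a timeout, up to `max_retries` times. Below is a minimal standalone sketch of that timeout-and-resubmit pattern, illustrative only and not the package's code: the worker is a stand-in `time.sleep`, and this version also drops the timed-out future from the polling dict when it retries so the same slow attempt is not polled again.

import logging
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("retry_demo")

def process_priority(priority: int) -> None:
    time.sleep(priority)  # stand-in for processing all dates of one priority

max_retries = 3
timeout = 2.0         # seconds to wait on each poll, like DataWrapper's `timeout`
priorities = [1, 5]   # the 5-second task will exceed the 2-second poll window

with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
    futures = {executor.submit(process_priority, p): p for p in priorities}
    retries = {p: 0 for p in priorities}

    while futures:
        for future in list(futures.keys()):
            priority = futures[future]
            try:
                future.result(timeout=timeout)
                del futures[future]  # completed
            except TimeoutError:
                # result() only stops waiting; the worker thread keeps running.
                # Resubmitting queues a fresh attempt alongside it.
                retries[priority] += 1
                del futures[future]
                if retries[priority] <= max_retries:
                    logger.warning("priority %s timed out, retry %s/%s",
                                   priority, retries[priority], max_retries)
                    futures[executor.submit(process_priority, priority)] = priority
                else:
                    logger.error("priority %s exceeded %s retries, skipping",
                                 priority, max_retries)
            except Exception:
                logger.exception("priority %s failed", priority)
                del futures[future]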

sibi_dst/utils/date_utils.py

@@ -1,6 +1,7 @@
 import datetime
-from typing import Union, Tuple, Callable, Dict
+from typing import Union, Tuple, Callable, Dict, Optional
 
+import fsspec
 import numpy as np
 import pandas as pd
 
@@ -143,6 +144,158 @@ class DateUtils:
         'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
     }
 
+    def is_file_older_than(self, file_path: str, max_age_minutes: int, fs: Optional[fsspec.AbstractFileSystem] = None,
+                           ignore_missing: bool = False, verbose: bool = False) -> bool:
+        """
+        Check if a file or a partitioned Parquet dataset is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file or dataset.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object (e.g., S3, local). If not provided, defaults to the local filesystem.
+        :param ignore_missing: If True, treat missing files as not old. Defaults to False.
+        :param verbose: If True, log detailed information. Defaults to False.
+        :return: True if the file or dataset is older than max_age_minutes, False otherwise.
+        """
+        fs = fs or fsspec.filesystem("file")
+        self.logger.info(f"Checking age for {file_path}...")
+
+        try:
+            if not fs.exists(file_path):
+                self.logger.info(f"Path not found: {file_path}.")
+                return not ignore_missing
+
+            if fs.isdir(file_path):
+                self.logger.info(f"Found that {file_path} is a directory...")
+                return self._is_directory_older_than(file_path, max_age_minutes, fs, verbose)
+
+            elif fs.isfile(file_path):
+                return self._is_file_older_than(file_path, max_age_minutes, fs, verbose)
+
+            else:
+                self.logger.warning(f"Path {file_path} is neither a file nor a directory.")
+                return True
+
+        except Exception as e:
+            self.logger.warning(f"Error checking age for {file_path}: {str(e)}")
+            return True
+
+    def _is_directory_older_than(self, dir_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                                 verbose: bool) -> bool:
+        """
+        Check if the oldest file in a directory is older than the specified max_age_minutes.
+
+        :param dir_path: Path to the directory.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object.
+        :param verbose: If True, log detailed information.
+        :return: True if the oldest file is older than max_age_minutes, False otherwise.
+        """
+        all_files = fs.ls(dir_path)
+        if not all_files:
+            self.logger.info(f"No files found in dataset: {dir_path}.")
+            return True
+
+        modification_times = [
+            self._get_modification_time(fs.info(file), file)
+            for file in all_files
+            if self._is_valid_file(file, fs)
+        ]
+
+        if not modification_times:
+            self.logger.warning(f"No valid modification times found for dataset: {dir_path}. Assuming dataset is old.")
+            return True
+
+        oldest_modification_time = min(modification_times)
+        dataset_age_minutes = (datetime.datetime.now(
+            datetime.timezone.utc) - oldest_modification_time).total_seconds() / 60
+
+        if verbose:
+            self.logger.info(
+                f"Oldest file in dataset {dir_path} is {round(dataset_age_minutes, 2)} minutes old "
+                f"(threshold: {max_age_minutes} minutes)"
+            )
+
+        return dataset_age_minutes > max_age_minutes
+
+    def _is_file_older_than(self, file_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                            verbose: bool) -> bool:
+        """
+        Check if a single file is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file.
+        :param max_age_minutes: Maximum allowed age in minutes.
+        :param fs: Filesystem object.
+        :param verbose: If True, log detailed information.
+        :return: True if the file is older than max_age_minutes, False otherwise.
+        """
+        info = fs.info(file_path)
+        if verbose:
+            self.logger.debug(f"File info for {file_path}: {info}")
+
+        file_modification_datetime = self._get_modification_time(info, file_path)
+        file_age_minutes = (datetime.datetime.now(
+            datetime.timezone.utc) - file_modification_datetime).total_seconds() / 60
+
+        if verbose:
+            self.logger.debug(
+                f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
+                f"(threshold: {max_age_minutes} minutes)"
+            )
+
+        return file_age_minutes > max_age_minutes
+
+    def _is_valid_file(self, file_path: str, fs: fsspec.AbstractFileSystem) -> bool:
+        """
+        Check if a file is valid (exists and has a valid modification time).
+
+        :param file_path: Path to the file.
+        :param fs: Filesystem object.
+        :return: True if the file is valid, False otherwise.
+        """
+        try:
+            fs.info(file_path)
+            return True
+        except Exception as e:
+            self.logger.warning(f"Error checking file age for {file_path}: {str(e)}")
+            return False
+
+    def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
+        """
+        Extract the modification time from file info.
+
+        :param info: File info dictionary.
+        :param file_path: Path to the file (for logging purposes).
+        :return: Modification time as a timezone-aware datetime object.
+        """
+        if "LastModified" in info:  # S3-compatible filesystem
+            last_modified = info["LastModified"]
+            if isinstance(last_modified, datetime.datetime):
+                return last_modified
+            else:
+                return datetime.datetime.strptime(last_modified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
+                    tzinfo=datetime.timezone.utc)
+
+        elif "mtime" in info:  # Local filesystem
+            return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
+
+        elif "modified" in info:  # FTP or SSH filesystem
+            modified_str = info["modified"]
+            try:
+                return datetime.datetime.strptime(modified_str, "%Y-%m-%d %H:%M:%S").replace(
+                    tzinfo=datetime.timezone.utc)
+            except ValueError:
+                try:
+                    return datetime.datetime.strptime(modified_str, "%b %d %H:%M").replace(
+                        year=datetime.datetime.now().year, tzinfo=datetime.timezone.utc
+                    )
+                except ValueError:
+                    self.logger.warning(f"Unsupported modification time format for {file_path}: {modified_str}")
+                    raise ValueError("Unsupported modification time format")
+
+        else:  # Fallback for unsupported filesystems
+            self.logger.warning(f"Modification time not available for {file_path}.")
+            raise ValueError("Modification time not available")
+
 
 class BusinessDays:
     """

sibi_dst-0.3.37.dist-info/METADATA → sibi_dst-0.3.38.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.37
+Version: 0.3.38
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com

sibi_dst-0.3.37.dist-info/RECORD → sibi_dst-0.3.38.dist-info/RECORD

@@ -42,14 +42,14 @@ sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWd
 sibi_dst/utils/clickhouse_writer.py,sha256=syXGN9NG1FS8soHuMj6QNRqTRWi-thuYUF-_BWDc_KI,9883
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_utils.py,sha256=j-lEKt6EJL2fm0z7adcjtVG7yFYLRpQL8xSgh2CVmJg,8769
-sibi_dst/utils/data_wrapper.py,sha256=V9p1CKvlmT3yyfZ5d2BIJSvz0Ee1TIA4hufpTIYo6v8,16684
-sibi_dst/utils/date_utils.py,sha256=ei7WgzIUk1tRa3sHniaVm_lNmfTGq12b_HzmMV91k18,12407
+sibi_dst/utils/data_wrapper.py,sha256=k9fTKbF2njAmRTeLAtW3HU-vwF0y43KnROvwWLqR7Bk,16340
+sibi_dst/utils/date_utils.py,sha256=UppOs1vfm41Si9JITAM7Qn9qqOi9yb6ukYz1E2mnA1I,19214
 sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10989
 sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
 sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
 sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
 sibi_dst/utils/parquet_saver.py,sha256=_QkXL3IiC2b4m7sxHpCSeqPwBWxXeiP5sH_WheSMEm4,8042
 sibi_dst/utils/storage_manager.py,sha256=-zlMrRo_6o6mCd_OHknKqNQl7m0I9VW89grDAUO1V5c,4229
-sibi_dst-0.3.37.dist-info/METADATA,sha256=A09_8PykbkfOjpCHIpiij4Wn-nlS2tU6gsR0-MngZUQ,2564
-sibi_dst-0.3.37.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-0.3.37.dist-info/RECORD,,
+sibi_dst-0.3.38.dist-info/METADATA,sha256=ydwsS0rynG0HRVwwyRRxu_Kj4nA-0mGPb8JohGnUWEQ,2564
+sibi_dst-0.3.38.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.38.dist-info/RECORD,,