sibi-dst 0.3.37__py3-none-any.whl → 0.3.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
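Taken together, the changes below make DataWrapper thread-aware and relocate its file-age logic: the constructor gains a threading.Lock and creates the fsspec filesystem lazily behind it, process() accepts a max_retries argument and resubmits priority levels that time out instead of cancelling them outright, processed dates are recorded in a new processed_dates list, the old DataWrapper.is_file_older_than method is replaced by a filesystem-aware DateUtils.is_file_older_than that also handles partitioned directories, and update_priority is assigned from a priority_map as each plan row is built, with a new priority-0 "file is recent" category that is skipped.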
sibi_dst/utils/data_wrapper.py
@@ -1,13 +1,13 @@
  import datetime
  from concurrent.futures import ThreadPoolExecutor
- from typing import Type, Any, Dict, Optional
-
+ from typing import Type, Any, Dict, Optional, Union
+ from threading import Lock
  import fsspec
  import pandas as pd
  from IPython.display import display
  from tqdm import tqdm

- from sibi_dst.utils import Logger
+ from sibi_dst.utils import Logger, DateUtils
  from sibi_dst.utils import ParquetSaver

@@ -87,18 +87,18 @@ class DataWrapper:
                reverse_order: bool = False,
                overwrite: bool = False,
                ignore_missing: bool = False,
-               logger: Optional[Logger] = None,
+               logger: Logger = None,
                max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                show_progress: bool = False,
-               timeout: Optional[int] = 300):
+               timeout: float = 300):
          self.dataclass = dataclass
          self.date_field = date_field
          self.data_path = self.ensure_forward_slash(data_path)
          self.parquet_filename = parquet_filename
          self.filesystem_type = filesystem_type
          self.filesystem_options = filesystem_options or {}
-         self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
+         self.fs = fs
          self.verbose = verbose
          self.class_params = class_params or {}
          self.load_params = load_params or {}
@@ -113,9 +113,16 @@ class DataWrapper:

          self.start_date = self.convert_to_date(start_date)
          self.end_date = self.convert_to_date(end_date)
+         self._lock = Lock()
+         self.processed_dates = []
+         self.date_utils = DateUtils(logger=self.logger)
+         if self.fs is None:
+             with self._lock:
+                 if self.fs is None:
+                     self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)

      @staticmethod
-     def convert_to_date(date: Any) -> datetime.date:
+     def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
          if isinstance(date, datetime.date):
              return date
          try:
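The constructor now initializes the filesystem with a double-checked locking pattern: test, acquire the lock, test again, then create. A minimal standalone sketch of the same idiom, assuming only fsspec is installed (the LazyFS class and its attribute names are illustrative, not part of sibi-dst):

    import threading

    import fsspec


    class LazyFS:
        """Illustrative holder that creates an fsspec filesystem at most once."""

        def __init__(self, filesystem_type: str = "file", **filesystem_options):
            self._fs = None
            self._lock = threading.Lock()
            self._type = filesystem_type
            self._options = filesystem_options

        @property
        def fs(self):
            if self._fs is None:              # cheap check without the lock
                with self._lock:              # serialize initialization
                    if self._fs is None:      # re-check: another thread may have won
                        self._fs = fsspec.filesystem(self._type, **self._options)
            return self._fs


    print(LazyFS().fs.exists("."))  # True on a local filesystem

Inside __init__ itself the lock is not strictly necessary, since the instance is not yet visible to other threads; the pattern pays off when the check can run concurrently, as in a lazily accessed property.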
@@ -135,7 +142,7 @@ class DataWrapper:
          for date in date_range:
              yield date.date()

-     def process(self):
+     def process(self, max_retries: int = 3):
          """
          Processes update tasks by generating an update plan, filtering required updates, and distributing
          the workload across threads based on priority levels.
@@ -145,8 +152,8 @@ class DataWrapper:
          Each thread handles the updates for a specific priority level, ensuring a streamlined approach
          to handling the updates efficiently.

+         :param max_retries: Maximum number of retries for a task after a timeout. Defaults to 3.
          :raises TimeoutError: If a thread processing a priority level exceeds the allowed timeout duration.
-
          :return: None
          """
          update_plan_table = self.generate_update_plan_with_conditions()
@@ -156,12 +163,14 @@ class DataWrapper:
          display(update_plan_table)

          # Filter out rows that do not require updates (priority 0 means skip)
-         update_plan_table = update_plan_table[
-             (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
-         ]
+         with self._lock:
+             update_plan_table = update_plan_table[
+                 (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+             ]

          # Group by priority
-         priorities = sorted(update_plan_table["update_priority"].unique())
+         with self._lock:
+             priorities = sorted(update_plan_table["update_priority"].unique())

          # We will process each priority level in its own thread.
          # Each thread will handle all dates associated with that priority.
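The filtering step above keeps only rows that both require an update and carry a non-zero priority. A small reproduction of that mask on a toy frame (column names come from the diff; the data is invented), which also shows that the `== True` comparison is redundant on a boolean column:

    import pandas as pd

    plan = pd.DataFrame({
        "date": ["2024-01-01", "2024-01-02", "2024-01-03"],
        "update_required": [True, False, True],
        "update_priority": [2, 0, 0],
    })

    # Equivalent to the mask in process(); the boolean column is its own mask.
    todo = plan[plan["update_required"] & (plan["update_priority"] != 0)]
    priorities = sorted(todo["update_priority"].unique())
    print(len(todo), priorities)  # 1 [2]

Wrapping these purely local pandas operations in the instance lock is conservative; nothing else appears to mutate update_plan_table concurrently at this point.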
@@ -174,7 +183,8 @@ class DataWrapper:
              # If show_progress is True, wrap in a progress bar
              date_iterator = dates_to_process
              if self.show_progress:
-                 date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                 date_iterator = tqdm(date_iterator,
+                                      desc=f"Processing priority {priority}:{self.dataclass.__name__}",
                                       unit="date")

              # Process each date for this priority
@@ -184,63 +194,31 @@ class DataWrapper:
          # Launch a separate thread for each priority
          with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
              futures = {executor.submit(process_priority, p): p for p in priorities}
-             for future in futures:
-                 try:
-                     future.result(timeout=self.timeout)
-                 except TimeoutError:
-                     self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
-                     future.cancel()
-                     priority = futures[future]
-                     new_future = executor.submit(process_priority, priority)
-                     futures[new_future] = priority
-                     self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
-
-     def is_file_older_than(self, file_path: str) -> bool:
-         """
-         Check if a file is older than the specified max_age_minutes.
-
-         :param file_path: Path to the file.
-         :return: True if the file is older than max_age_minutes, False otherwise.
-         """
-         try:
-             # Get file info
-             info = self.fs.info(file_path)
-             self.logger.info(f"File info for {file_path}: {info}")
-
-             # Determine the modification time from available keys
-             file_modification_time = None
-             if "mtime" in info:  # Local filesystem
-                 file_modification_time = info["mtime"]
-                 file_modification_datetime = datetime.datetime.fromtimestamp(
-                     file_modification_time, tz=datetime.timezone.utc
-                 )
-             elif "LastModified" in info:  # S3-compatible filesystem
-                 file_modification_datetime = (
-                     info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
-                     else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
-                 )
-                 self.logger.info(f"S3 File modification time: {file_modification_datetime}")
-             else:
-                 self.logger.warning(f"Modification time not available for {file_path}.")
-                 return True  # Assume file is too old if we cannot determine its age
-
-             # Compare file age
-             current_time = datetime.datetime.now(datetime.timezone.utc)
-             file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
-             self.logger.info(
-                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
-                 f"(threshold: {self.max_age_minutes} minutes)"
-             )
-             return file_age_minutes > self.max_age_minutes
-
-         except FileNotFoundError:
-             self.logger.warning(f"File {file_path} not found.")
-             if self.ignore_missing:
-                 return False
-             return True  # File is considered old if it doesn't exist
-         except Exception as e:
-             self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
-             return True
+             retries = {p: 0 for p in priorities}  # Track retry counts for each priority
+
+             while futures:
+                 for future in list(futures.keys()):
+                     try:
+                         future.result(timeout=self.timeout)
+                         del futures[future]  # Remove completed future
+                     except TimeoutError:
+                         priority = futures[future]
+                         retries[priority] += 1
+
+                         if retries[priority] <= max_retries:
+                             self.logger.warning(
+                                 f"Thread for priority {priority} timed out. Retrying ({retries[priority]}/{max_retries})..."
+                             )
+                             new_future = executor.submit(process_priority, priority)
+                             futures[new_future] = priority
+                         else:
+                             self.logger.error(
+                                 f"Thread for priority {priority} timed out. Max retries ({max_retries}) exceeded. Skipping."
+                             )
+                             del futures[future]  # Remove the timed-out future
+                     except Exception as e:
+                         self.logger.error(f"Error processing priority {futures[future]}: {e}")
+                         del futures[future]  # Remove the failed future

      def process_date(self, date: datetime.date):
          """
@@ -261,6 +239,7 @@ class DataWrapper:
          full_parquet_filename = f"{folder}{self.parquet_filename}"

          start_time = datetime.datetime.now()
+         self.logger.info(f"Processing date: {date}")
          self.logger.info(f"Processing {full_parquet_filename}...")

          data_object = self.dataclass(**self.class_params)
@@ -270,14 +249,18 @@ class DataWrapper:
              self.logger.error("No data found for the specified date.")
              return

-         parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
-         parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
+         with self._lock:
+             parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
+             parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)

-         end_time = datetime.datetime.now()
-         duration_seconds = (end_time - start_time).total_seconds()
-         self.logger.info(
-             f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
-         )
+             end_time = datetime.datetime.now()
+             duration_seconds = (end_time - start_time).total_seconds()
+             self.logger.info(
+                 f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+             )
+
+         self.processed_dates.append(date)
+         self.logger.info(f"Finished processing date: {date}")

      def generate_update_plan_with_conditions(self):
          """
@@ -297,7 +280,12 @@ class DataWrapper:

          today = datetime.date.today()
          history_start_date = today - datetime.timedelta(days=self.history_days_threshold)
-
+         priority_map = {
+             "file is recent": 0,
+             "overwrite": 1,
+             "history_days": 2,
+             "missing_files": 3
+         }
          date_range = self.generate_date_range()
          if self.show_progress:
              date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
@@ -317,12 +305,23 @@ class DataWrapper:
                  update_required = True
              # Hierarchy 2: History threshold evaluation
              elif within_history:
-                 if self.is_file_older_than(full_parquet_filename):
-                     category = "history_days"
-                     update_required = True
+                 if file_exists:
+                     if self.date_utils.is_file_older_than(
+                         full_parquet_filename,
+                         max_age_minutes=self.max_age_minutes,
+                         fs=self.fs,
+                         ignore_missing=self.ignore_missing,
+                         verbose=self.verbose
+                     ):
+                         category = "history_days"
+                         update_required = True
+                     else:
+                         category = "file is recent"
+                         update_required = False
                  else:
-                     category = "file age is recent"
-                     update_required = False
+                     category = "missing_files"
+                     update_required = True
+
              # Hierarchy 3: Missing files
              elif missing_file and current_date <= today:
                  category = "missing_files"
@@ -332,25 +331,17 @@ class DataWrapper:
                  update_required = False

              # Collect condition descriptions for the update plan table
-             rows.append({
+             row = {
                  "date": current_date,
                  "file_exists": file_exists,
                  "within_history": within_history,
                  "missing_file": missing_file,
                  "update_required": update_required,
                  "update_category": category,
-                 "datawrapper class": self.dataclass.__name__
-             })
-             priority_map = {
-                 "overwrite": 1,
-                 "history_days": 2,
-                 "missing_files": 3
+                 "datawrapper class": self.dataclass.__name__,
+                 "update_priority": priority_map.get(category, 0)
              }
-
-         for row in rows:
-             category = row.get("update_category")
-             # Default to None if no category assigned (no update required)
-             row["update_priority"] = priority_map.get(category, 0)
+             rows.append(row)

          update_plan_table = pd.DataFrame(rows)
          return update_plan_table
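With priority_map hoisted ahead of the loop, every row resolves its update_priority at construction time, replacing the second pass over rows that 0.3.37 performed. The lookup itself is just a dict.get with a skip-priority default; a tiny illustration (the category strings are the ones defined in the diff):

    priority_map = {
        "file is recent": 0,
        "overwrite": 1,
        "history_days": 2,
        "missing_files": 3,
    }

    # Unknown or unset categories fall back to 0, which process() filters out as "skip".
    for category in ["overwrite", "history_days", "file is recent", None]:
        print(category, "->", priority_map.get(category, 0))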
sibi_dst/utils/date_utils.py
@@ -1,6 +1,7 @@
  import datetime
- from typing import Union, Tuple, Callable, Dict
+ from typing import Union, Tuple, Callable, Dict, Optional

+ import fsspec
  import numpy as np
  import pandas as pd

@@ -143,6 +144,158 @@ class DateUtils:
          'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
      }

+     def is_file_older_than(self, file_path: str, max_age_minutes: int, fs: Optional[fsspec.AbstractFileSystem] = None,
+                            ignore_missing: bool = False, verbose: bool = False) -> bool:
+         """
+         Check if a file or a partitioned Parquet dataset is older than the specified max_age_minutes.
+
+         :param file_path: Path to the file or dataset.
+         :param max_age_minutes: Maximum allowed age in minutes.
+         :param fs: Filesystem object (e.g., S3, local). If not provided, defaults to the local filesystem.
+         :param ignore_missing: If True, treat missing files as not old. Defaults to False.
+         :param verbose: If True, log detailed information. Defaults to False.
+         :return: True if the file or dataset is older than max_age_minutes, False otherwise.
+         """
+         fs = fs or fsspec.filesystem("file")
+         self.logger.info(f"Checking age for {file_path}...")
+
+         try:
+             if not fs.exists(file_path):
+                 self.logger.info(f"Path not found: {file_path}.")
+                 return not ignore_missing
+
+             if fs.isdir(file_path):
+                 self.logger.info(f"Found that {file_path} is a directory...")
+                 return self._is_directory_older_than(file_path, max_age_minutes, fs, verbose)
+
+             elif fs.isfile(file_path):
+                 return self._is_file_older_than(file_path, max_age_minutes, fs, verbose)
+
+             else:
+                 self.logger.warning(f"Path {file_path} is neither a file nor a directory.")
+                 return True
+
+         except Exception as e:
+             self.logger.warning(f"Error checking age for {file_path}: {str(e)}")
+             return True
+
+     def _is_directory_older_than(self, dir_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                                  verbose: bool) -> bool:
+         """
+         Check if the oldest file in a directory is older than the specified max_age_minutes.
+
+         :param dir_path: Path to the directory.
+         :param max_age_minutes: Maximum allowed age in minutes.
+         :param fs: Filesystem object.
+         :param verbose: If True, log detailed information.
+         :return: True if the oldest file is older than max_age_minutes, False otherwise.
+         """
+         all_files = fs.ls(dir_path)
+         if not all_files:
+             self.logger.info(f"No files found in dataset: {dir_path}.")
+             return True
+
+         modification_times = [
+             self._get_modification_time(fs.info(file), file)
+             for file in all_files
+             if self._is_valid_file(file, fs)
+         ]
+
+         if not modification_times:
+             self.logger.warning(f"No valid modification times found for dataset: {dir_path}. Assuming dataset is old.")
+             return True
+
+         oldest_modification_time = min(modification_times)
+         dataset_age_minutes = (datetime.datetime.now(
+             datetime.timezone.utc) - oldest_modification_time).total_seconds() / 60
+
+         if verbose:
+             self.logger.info(
+                 f"Oldest file in dataset {dir_path} is {round(dataset_age_minutes, 2)} minutes old "
+                 f"(threshold: {max_age_minutes} minutes)"
+             )
+
+         return dataset_age_minutes > max_age_minutes
+
+     def _is_file_older_than(self, file_path: str, max_age_minutes: int, fs: fsspec.AbstractFileSystem,
+                             verbose: bool) -> bool:
+         """
+         Check if a single file is older than the specified max_age_minutes.
+
+         :param file_path: Path to the file.
+         :param max_age_minutes: Maximum allowed age in minutes.
+         :param fs: Filesystem object.
+         :param verbose: If True, log detailed information.
+         :return: True if the file is older than max_age_minutes, False otherwise.
+         """
+         info = fs.info(file_path)
+         if verbose:
+             self.logger.debug(f"File info for {file_path}: {info}")
+
+         file_modification_datetime = self._get_modification_time(info, file_path)
+         file_age_minutes = (datetime.datetime.now(
+             datetime.timezone.utc) - file_modification_datetime).total_seconds() / 60
+
+         if verbose:
+             self.logger.debug(
+                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
+                 f"(threshold: {max_age_minutes} minutes)"
+             )
+
+         return file_age_minutes > max_age_minutes
+
+     def _is_valid_file(self, file_path: str, fs: fsspec.AbstractFileSystem) -> bool:
+         """
+         Check if a file is valid (exists and has a valid modification time).
+
+         :param file_path: Path to the file.
+         :param fs: Filesystem object.
+         :return: True if the file is valid, False otherwise.
+         """
+         try:
+             fs.info(file_path)
+             return True
+         except Exception as e:
+             self.logger.warning(f"Error checking file age for {file_path}: {str(e)}")
+             return False
+
+     def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
+         """
+         Extract the modification time from file info.
+
+         :param info: File info dictionary.
+         :param file_path: Path to the file (for logging purposes).
+         :return: Modification time as a timezone-aware datetime object.
+         """
+         if "LastModified" in info:  # S3-compatible filesystem
+             last_modified = info["LastModified"]
+             if isinstance(last_modified, datetime.datetime):
+                 return last_modified
+             else:
+                 return datetime.datetime.strptime(last_modified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
+                     tzinfo=datetime.timezone.utc)
+
+         elif "mtime" in info:  # Local filesystem
+             return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
+
+         elif "modified" in info:  # FTP or SSH filesystem
+             modified_str = info["modified"]
+             try:
+                 return datetime.datetime.strptime(modified_str, "%Y-%m-%d %H:%M:%S").replace(
+                     tzinfo=datetime.timezone.utc)
+             except ValueError:
+                 try:
+                     return datetime.datetime.strptime(modified_str, "%b %d %H:%M").replace(
+                         year=datetime.datetime.now().year, tzinfo=datetime.timezone.utc
+                     )
+                 except ValueError:
+                     self.logger.warning(f"Unsupported modification time format for {file_path}: {modified_str}")
+                     raise ValueError("Unsupported modification time format")
+
+         else:  # Fallback for unsupported filesystems
+             self.logger.warning(f"Modification time not available for {file_path}.")
+             raise ValueError("Modification time not available")
+


  class BusinessDays:
      """
sibi_dst-0.3.37.dist-info/METADATA → sibi_dst-0.3.39.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.37
+ Version: 0.3.39
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
sibi_dst-0.3.37.dist-info/RECORD → sibi_dst-0.3.39.dist-info/RECORD
@@ -42,14 +42,14 @@ sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWd
  sibi_dst/utils/clickhouse_writer.py,sha256=syXGN9NG1FS8soHuMj6QNRqTRWi-thuYUF-_BWDc_KI,9883
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
  sibi_dst/utils/data_utils.py,sha256=j-lEKt6EJL2fm0z7adcjtVG7yFYLRpQL8xSgh2CVmJg,8769
- sibi_dst/utils/data_wrapper.py,sha256=V9p1CKvlmT3yyfZ5d2BIJSvz0Ee1TIA4hufpTIYo6v8,16684
- sibi_dst/utils/date_utils.py,sha256=ei7WgzIUk1tRa3sHniaVm_lNmfTGq12b_HzmMV91k18,12407
+ sibi_dst/utils/data_wrapper.py,sha256=SMmr4hwnUvCrN6nouI8N7aRiczuatjSMdYzNOBcDnr8,16283
+ sibi_dst/utils/date_utils.py,sha256=UppOs1vfm41Si9JITAM7Qn9qqOi9yb6ukYz1E2mnA1I,19214
  sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10989
  sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
  sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
  sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
  sibi_dst/utils/parquet_saver.py,sha256=_QkXL3IiC2b4m7sxHpCSeqPwBWxXeiP5sH_WheSMEm4,8042
  sibi_dst/utils/storage_manager.py,sha256=-zlMrRo_6o6mCd_OHknKqNQl7m0I9VW89grDAUO1V5c,4229
- sibi_dst-0.3.37.dist-info/METADATA,sha256=A09_8PykbkfOjpCHIpiij4Wn-nlS2tU6gsR0-MngZUQ,2564
- sibi_dst-0.3.37.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- sibi_dst-0.3.37.dist-info/RECORD,,
+ sibi_dst-0.3.39.dist-info/METADATA,sha256=SZJ04u1MedYe00Z3h_YexNMEzB2Nfd9BFyTdDXVX6ZM,2564
+ sibi_dst-0.3.39.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ sibi_dst-0.3.39.dist-info/RECORD,,