sibi-dst 0.3.38__tar.gz → 0.3.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/pyproject.toml +1 -1
  3. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/data_wrapper.py +51 -49
  4. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/README.md +0 -0
  5. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/__init__.py +0 -0
  6. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/__init__.py +0 -0
  7. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_df_helper.py +0 -0
  8. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  9. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  10. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/__init__.py +0 -0
  11. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  12. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  13. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  14. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  15. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  16. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  17. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  18. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  19. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  20. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  21. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  22. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  23. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  24. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  25. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  26. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  27. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/__init__.py +0 -0
  28. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_defaults.py +0 -0
  29. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  30. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_params_config.py +0 -0
  31. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/core/_query_config.py +0 -0
  32. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/df_helper/data_cleaner.py +0 -0
  33. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/__init__.py +0 -0
  34. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  35. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/geopy_helper/utils.py +0 -0
  36. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/__init__.py +0 -0
  37. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  38. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  39. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  40. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  41. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/osmnx_helper/utils.py +0 -0
  42. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/tests/__init__.py +0 -0
  43. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  44. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/__init__.py +0 -0
  45. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/airflow_manager.py +0 -0
  46. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/clickhouse_writer.py +0 -0
  47. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/credentials.py +0 -0
  48. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/data_utils.py +0 -0
  49. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/date_utils.py +0 -0
  50. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/df_utils.py +0 -0
  51. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/file_utils.py +0 -0
  52. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/filepath_generator.py +0 -0
  53. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/log_utils.py +0 -0
  54. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/parquet_saver.py +0 -0
  55. {sibi_dst-0.3.38 → sibi_dst-0.3.40}/sibi_dst/utils/storage_manager.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.38
+Version: 0.3.40
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.38"
+version = "0.3.40"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
sibi_dst/utils/data_wrapper.py
@@ -91,14 +91,14 @@ class DataWrapper:
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                  show_progress: bool = False,
-                 timeout: float = 300):
+                 timeout: float = 60):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self.ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
+        self.fs = fs
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}
@@ -116,6 +116,10 @@ class DataWrapper:
         self._lock = Lock()
         self.processed_dates = []
         self.date_utils = DateUtils(logger=self.logger)
+        if self.fs is None:
+            with self._lock:
+                if self.fs is None:
+                    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
 
     @staticmethod
     def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
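
Taken together, the two hunks above replace eager filesystem construction in the constructor with lazy, double-checked initialization behind the instance lock, so fsspec.filesystem is only invoked when the caller did not supply an fs handle (the constructor's default timeout also drops from 300 to 60 alongside this). A minimal standalone sketch of the same pattern; class and method names here are illustrative, not part of the package:

    import threading

    import fsspec


    class LazyFilesystem:
        def __init__(self, filesystem_type="file", **filesystem_options):
            self.fs = None
            self._lock = threading.Lock()
            self._type = filesystem_type
            self._options = filesystem_options

        def get_fs(self):
            # Cheap unlocked check first; the locked re-check prevents two
            # threads from each constructing their own filesystem instance.
            if self.fs is None:
                with self._lock:
                    if self.fs is None:
                        self.fs = fsspec.filesystem(self._type, **self._options)
            return self.fs

In CPython the double check mostly guards against redundant construction rather than memory-visibility bugs; the practical effect is that the default filesystem is built once, after the lock exists, instead of eagerly during attribute assignment.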
@@ -154,16 +158,16 @@ class DataWrapper:
         """
         update_plan_table = self.generate_update_plan_with_conditions()
 
-        # Display the update plan table to the user if requested
-        if self.show_progress:
-            display(update_plan_table)
-
         # Filter out rows that do not require updates (priority 0 means skip)
         with self._lock:
             update_plan_table = update_plan_table[
                 (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
             ]
-
+        # Display the update plan table to the user if requested
+        if len(update_plan_table.index) == 0:
+            return
+        if self.show_progress:
+            display(update_plan_table)
         # Group by priority
         with self._lock:
             priorities = sorted(update_plan_table["update_priority"].unique())
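
The reordering above filters the plan first, returns early when nothing needs updating, and only then displays the table, so users no longer see rows that will be skipped. The early return also protects the executor below: an empty plan yields zero priorities, and ThreadPoolExecutor(max_workers=0) raises ValueError. A small sketch of the guard with a dummy workload:

    from concurrent.futures import ThreadPoolExecutor


    def run(priorities):
        if not priorities:
            # Mirrors the new early return: never reach max_workers=0,
            # which ThreadPoolExecutor rejects with ValueError.
            return
        with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
            list(executor.map(print, sorted(priorities)))


    run([])      # returns immediately, no pool created
    run([2, 1])  # prints 1 and 2 from worker threads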
@@ -172,21 +176,20 @@ class DataWrapper:
         # Each thread will handle all dates associated with that priority.
         def process_priority(priority):
             # Extract dates for the current priority
-            with self._lock:
-                dates_to_process = update_plan_table[
-                    update_plan_table["update_priority"] == priority
-                ]["date"].tolist()
+            dates_to_process = update_plan_table[
+                update_plan_table["update_priority"] == priority
+            ]["date"].tolist()
 
-                # If show_progress is True, wrap in a progress bar
-                date_iterator = dates_to_process
-                if self.show_progress:
-                    date_iterator = tqdm(date_iterator,
-                                         desc=f"Processing priority {priority}:{self.dataclass.__name__}",
-                                         unit="date")
+            # If show_progress is True, wrap in a progress bar
+            date_iterator = dates_to_process
+            if self.show_progress:
+                date_iterator = tqdm(date_iterator,
+                                     desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                                     unit="date")
 
-                # Process each date for this priority
-                for current_date in date_iterator:
-                    self.process_date(current_date)
+            # Process each date for this priority
+            for current_date in date_iterator:
+                self.process_date(current_date)
 
         # Launch a separate thread for each priority
         with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
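
Dropping the lock inside process_priority is sound because by this point update_plan_table is only read: the filtering in process() already happened under the lock, and concurrent boolean-mask selections on a DataFrame that nobody mutates need no serialization. An illustrative sketch (table contents are made up):

    from concurrent.futures import ThreadPoolExecutor

    import pandas as pd

    plan = pd.DataFrame({
        "date": ["2024-01-01", "2024-01-02", "2024-01-03"],
        "update_priority": [1, 1, 2],
    })


    def dates_for(priority):
        # Read-only selection; safe without a lock once the table is frozen.
        return plan[plan["update_priority"] == priority]["date"].tolist()


    with ThreadPoolExecutor(max_workers=2) as executor:
        print(list(executor.map(dates_for, [1, 2])))
        # [['2024-01-01', '2024-01-02'], ['2024-01-03']]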
@@ -232,21 +235,21 @@ class DataWrapper:
         :type date: datetime.date
         :return: None
         """
-        with self._lock:
-            folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
-            full_parquet_filename = f"{folder}{self.parquet_filename}"
+        folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
+        full_parquet_filename = f"{folder}{self.parquet_filename}"
 
-            start_time = datetime.datetime.now()
-            self.logger.info(f"Processing date: {date}")
-            self.logger.info(f"Processing {full_parquet_filename}...")
+        start_time = datetime.datetime.now()
+        self.logger.info(f"Processing date: {date}")
+        self.logger.info(f"Processing {full_parquet_filename}...")
 
-            data_object = self.dataclass(**self.class_params)
-            df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
+        data_object = self.dataclass(**self.class_params)
+        df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
-            if len(df.index) == 0:
-                self.logger.error("No data found for the specified date.")
-                return
+        if len(df.index) == 0:
+            self.logger.error("No data found for the specified date.")
+            return
 
+        with self._lock:
             parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
             parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
 
@@ -255,8 +258,9 @@ class DataWrapper:
             self.logger.info(
                 f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
             )
+
         self.processed_dates.append(date)
-            self.logger.info(f"Finished processing date: {date}")
+        self.logger.info(f"Finished processing date: {date}")
 
     def generate_update_plan_with_conditions(self):
         """
@@ -294,29 +298,33 @@ class DataWrapper:
             within_history = history_start_date <= current_date <= today
             missing_file = not file_exists and not self.ignore_missing
             category = None
+            update_required = False
 
             # Hierarchy 1: Overwrite
             if self.overwrite:
                 category = "overwrite"
                 update_required = True
-            # Hierarchy 2: History threshold evaluation
+            elif missing_file and current_date < today:
+                category = "missing_files"
+                update_required = True
+
             elif within_history:
-                if self.date_utils.is_file_older_than(
+                if file_exists:
+                    if self.date_utils.is_file_older_than(
                         full_parquet_filename,
                         max_age_minutes=self.max_age_minutes,
                         fs=self.fs,
                         ignore_missing=self.ignore_missing,
                         verbose=self.verbose
-                ):
-                    category = "history_days"
-                    update_required = True
+                    ):
+                        category = "history_days"
+                        update_required = True
+                    else:
+                        category = "file is recent"
+                        update_required = False
                 else:
-                    category = "file is recent"
-                    update_required = False
-            # Hierarchy 3: Missing files
-            elif missing_file and current_date <= today:
-                category = "missing_files"
-                update_required = True
+                    category = "missing_files"
+                    update_required = True
             else:
                 category = "No Update Required"
                 update_required = False
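
The rewritten hierarchy sets update_required to a defensive default up front and now evaluates, in order: overwrite, then files missing for strictly past dates (previously current_date <= today), then the history window split on whether the file exists, with a final fall-through. A condensed restatement of the new decision logic; every parameter is an illustrative boolean, not the package's API:

    def classify(overwrite, missing_file, is_past, within_history,
                 file_exists, is_stale):
        # Returns (update_category, update_required) per the new hierarchy.
        if overwrite:
            return "overwrite", True
        if missing_file and is_past:
            return "missing_files", True
        if within_history:
            if file_exists:
                return ("history_days", True) if is_stale else ("file is recent", False)
            return "missing_files", True
        return "No Update Required", False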
@@ -334,12 +342,6 @@ class DataWrapper:
             }
             rows.append(row)
 
-
-        for row in rows:
-            category = row.get("update_category")
-            # Default to None if no category assigned (no update required)
-            row["update_priority"] = priority_map.get(category, 0)
-
         update_plan_table = pd.DataFrame(rows)
         return update_plan_table
 
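With the post-pass over rows removed, update_priority must be populated when each row dict is built (the row construction closes just above this hunk and is unchanged here), presumably with the same priority_map.get(category, 0) lookup. A hypothetical inline version of that fold:

    priority_map = {"overwrite": 1, "missing_files": 2, "history_days": 3}


    def build_row(date, category, update_required):
        # Hypothetical helper: assign the priority while building the row
        # rather than patching it in a second pass; categories outside the
        # map default to 0, which process() filters out as "skip".
        return {
            "date": date,
            "update_category": category,
            "update_required": update_required,
            "update_priority": priority_map.get(category, 0),
        }

The priority_map values shown are invented for the example; only the get(category, 0) default is visible in the removed code.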
File without changes