sibi-dst 0.3.21__py3-none-any.whl → 0.3.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from ._io_dask import ReadFrameDask
3
4
  from ._django_db_connection import DjangoConnectionConfig
4
5
  from ._django_load_from_db import DjangoLoadFromDb
5
- from ._io_dask import ReadFrameDask
6
6
 
7
7
  __all__ = [
8
8
  "DjangoConnectionConfig",
@@ -1,22 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
- from ._airflow_manager import AirflowDAGManager
4
- from ._clickhouse_writer import ClickHouseWriter
5
- from ._credentials import *
6
- from ._data_utils import DataUtils
7
- from ._data_wrapper import DataWrapper
3
+ from ._log_utils import Logger
8
4
  from ._date_utils import *
9
- from ._df_utils import DfUtils
5
+ from ._data_utils import DataUtils
10
6
  from ._file_utils import FileUtils
11
7
  from ._filepath_generator import FilePathGenerator
12
- from ._log_utils import Logger
13
- from ._parquet_saver import ParquetSaver
8
+ from ._df_utils import DfUtils
14
9
  from ._storage_manager import StorageManager
10
+ from ._parquet_saver import ParquetSaver
11
+ from ._clickhouse_writer import ClickHouseWriter
12
+ from ._airflow_manager import AirflowDAGManager
13
+ from ._credentials import *
14
+ from ._data_wrapper import DataWrapper
15
15
 
16
16
  __all__ = [
17
+ "Logger",
17
18
  "ConfigManager",
18
19
  "ConfigLoader",
19
- "Logger",
20
20
  "DateUtils",
21
21
  "BusinessDays",
22
22
  "FileUtils",
@@ -1,4 +1,5 @@
1
1
  import datetime
2
+ from concurrent.futures import ThreadPoolExecutor
2
3
  from typing import Type, Any, Dict, Optional
3
4
 
4
5
  import fsspec
@@ -32,7 +33,8 @@ class DataWrapper:
32
33
  logger: Optional[Logger] = None,
33
34
  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
34
35
  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
35
- show_progress: bool = False):
36
+ show_progress: bool = False,
37
+ timeout: Optional[int] = 300):
36
38
  self.dataclass = dataclass
37
39
  self.date_field = date_field
38
40
  self.data_path = self.ensure_forward_slash(data_path)
@@ -50,6 +52,7 @@ class DataWrapper:
50
52
  self.max_age_minutes = max_age_minutes
51
53
  self.history_days_threshold = history_days_threshold
52
54
  self.show_progress = show_progress
55
+ self.timeout = timeout
53
56
 
54
57
  self.start_date = self.convert_to_date(start_date)
55
58
  self.end_date = self.convert_to_date(end_date)
@@ -76,31 +79,79 @@ class DataWrapper:
76
79
  yield date.date()
77
80
 
78
81
  def process(self):
79
- """Execute the update plan following the specified hierarchy."""
82
+ """Execute the update plan using 'update_priority' to determine processing order."""
80
83
  update_plan_table = self.generate_update_plan_with_conditions()
81
84
 
82
- # Display the update plan table to the user if show_progress is True
85
+ # Display the update plan table to the user if requested
83
86
  if self.show_progress:
84
87
  display(update_plan_table)
85
88
 
86
- # Process files according to the hierarchy, considering only `update_required` dates
87
- for category, description in [
88
- ("overwrite", "Processing files due to overwrite=True"),
89
- ("history_days", "Processing files within history_days_threshold"),
90
- ("missing_files", "Processing missing files")
91
- ]:
92
- # Filter dates in the category where `update_required` is True
89
+ # Filter out rows that do not require updates (priority 0 means skip)
90
+ update_plan_table = update_plan_table[
91
+ (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
92
+ ]
93
+
94
+ # Group by priority
95
+ priorities = sorted(update_plan_table["update_priority"].unique())
96
+
97
+ # We will process each priority level in its own thread.
98
+ # Each thread will handle all dates associated with that priority.
99
+ def process_priority(priority):
100
+ # Extract dates for the current priority
93
101
  dates_to_process = update_plan_table[
94
- (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
102
+ update_plan_table["update_priority"] == priority
95
103
  ]["date"].tolist()
96
104
 
105
+ # If show_progress is True, wrap in a progress bar
97
106
  date_iterator = dates_to_process
98
107
  if self.show_progress:
99
- date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
108
+ date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
109
+ unit="date")
100
110
 
111
+ # Process each date for this priority
101
112
  for current_date in date_iterator:
102
113
  self.process_date(current_date)
103
114
 
115
+ # Launch a separate thread for each priority
116
+ with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
117
+ futures = {executor.submit(process_priority, p): p for p in priorities}
118
+ for future in futures:
119
+ try:
120
+ future.result(timeout=self.timeout)
121
+ except TimeoutError:
122
+ self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
123
+ future.cancel()
124
+ priority = futures[future]
125
+ new_future = executor.submit(process_priority, priority)
126
+ futures[new_future] = priority
127
+ self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
128
+
129
+ # def process(self):
130
+ # """Execute the update plan following the specified hierarchy."""
131
+ # update_plan_table = self.generate_update_plan_with_conditions()
132
+ #
133
+ # # Display the update plan table to the user if show_progress is True
134
+ # if self.show_progress:
135
+ # display(update_plan_table)
136
+ #
137
+ # # Process files according to the hierarchy, considering only `update_required` dates
138
+ # for category, description in [
139
+ # ("overwrite", "Processing files due to overwrite=True"),
140
+ # ("history_days", "Processing files within history_days_threshold"),
141
+ # ("missing_files", "Processing missing files")
142
+ # ]:
143
+ # # Filter dates in the category where `update_required` is True
144
+ # dates_to_process = update_plan_table[
145
+ # (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
146
+ # ]["date"].tolist()
147
+ #
148
+ # date_iterator = dates_to_process
149
+ # if self.show_progress:
150
+ # date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
151
+ #
152
+ # for current_date in date_iterator:
153
+ # self.process_date(current_date)
154
+
104
155
  def is_file_older_than(self, file_path: str) -> bool:
105
156
  """
106
157
  Check if a file is older than the specified max_age_minutes.
@@ -181,12 +232,14 @@ class DataWrapper:
181
232
  category = "history_days"
182
233
  update_required = True
183
234
  else:
235
+ category = "file age is recent"
184
236
  update_required = False
185
237
  # Hierarchy 3: Missing files
186
238
  elif missing_file and current_date <= today:
187
239
  category = "missing_files"
188
240
  update_required = True
189
241
  else:
242
+ category = "No Update Required"
190
243
  update_required = False
191
244
 
192
245
  # Collect condition descriptions for the update plan table
@@ -199,6 +252,16 @@ class DataWrapper:
199
252
  "update_category": category,
200
253
  "datawrapper class": self.dataclass.__name__
201
254
  })
255
+ priority_map = {
256
+ "overwrite": 1,
257
+ "history_days": 2,
258
+ "missing_files": 3
259
+ }
260
+
261
+ for row in rows:
262
+ category = row.get("update_category")
263
+ # Default to None if no category assigned (no update required)
264
+ row["update_priority"] = priority_map.get(category, 0)
202
265
 
203
266
  update_plan_table = pd.DataFrame(rows)
204
267
  return update_plan_table
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.21
3
+ Version: 0.3.22
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -4,7 +4,7 @@ sibi_dst/df_helper/_df_helper.py,sha256=3fibDnRAX4R0v-xgfG87BKLR-ZCg8AZWrKDIO7qb
4
4
  sibi_dst/df_helper/_parquet_artifact.py,sha256=nx1wTEyrjARpCCPNwBxYiBROee3CSb6c-u7Cpme_tdk,4978
5
5
  sibi_dst/df_helper/_parquet_reader.py,sha256=sbe8DsScNT2h6huNsz8mUxVnUGpJeRzbaONZ3u2sQeQ,1685
6
6
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- sibi_dst/df_helper/backends/django/__init__.py,sha256=Py4XGV8OnWv_6qkdlJ0hhc1_xT_NLMS712N00CJOg0w,270
7
+ sibi_dst/df_helper/backends/django/__init__.py,sha256=6LNllh46ZQ10QpEAxYizEOybx0__nUmpyLVCp8uyHyc,270
8
8
  sibi_dst/df_helper/backends/django/_django_db_connection.py,sha256=GypF84Ej7ViTID4r3UEReRGmLiyfMtEknPI_NINSm3g,1641
9
9
  sibi_dst/df_helper/backends/django/_django_load_from_db.py,sha256=GLsAsuEQD1cXfEm7BuxofZfR32VwZNEfwR9c-AZn-x0,5555
10
10
  sibi_dst/df_helper/backends/django/_django_sql_model_builder.py,sha256=xyOq0JY0k9380aBeV66RXbeXL-mF22CczbQQoXLDhuo,14884
@@ -29,12 +29,12 @@ sibi_dst/df_helper/core/_defaults.py,sha256=eNpHD2sZxir-2xO0b3_V16ryw8YP_5FfpIKK
29
29
  sibi_dst/df_helper/core/_filter_handler.py,sha256=1-IdviSYi5Hc28KckO4dkYHDfQ8X9SUb6kwfobm16_E,8580
30
30
  sibi_dst/df_helper/core/_params_config.py,sha256=mM1CnF29zls5LXx7rpKY8uix_GyOG5smO4ry_OX31IU,3477
31
31
  sibi_dst/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
32
- sibi_dst/utils/__init__.py,sha256=5WeBMxhNGB8ZpHUrp1NOQf8Kn0bLOtjrerFjQdFTa7U,787
32
+ sibi_dst/utils/__init__.py,sha256=-7gtDN58v5f09bd2OGKAqMyccEyDP8tANWdw3RwI408,787
33
33
  sibi_dst/utils/_airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
34
34
  sibi_dst/utils/_clickhouse_writer.py,sha256=dL5pixjn4cj0Rwpc3POfCcY2D-aQCMbPSECX0dKATyE,8587
35
35
  sibi_dst/utils/_credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
36
36
  sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
37
- sibi_dst/utils/_data_wrapper.py,sha256=BmKFO70xVX3AjpGRzqrc6HS1Uw4xerZDx3IpFrcrIIo,9020
37
+ sibi_dst/utils/_data_wrapper.py,sha256=4U0sKVXK7qDTObhufO19jxTzJa6ohs2VOh3WAhhzLCU,11982
38
38
  sibi_dst/utils/_date_utils.py,sha256=CMAZBNwVj7cvERcNiTA8Pf7_5EjV9By9yxkYJpkqz1g,10656
39
39
  sibi_dst/utils/_df_utils.py,sha256=NHnEJG9KDeRuqfE60kwBOO21B6Hvjh7PzE5B8cQrIXc,10986
40
40
  sibi_dst/utils/_file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
@@ -42,6 +42,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=hjI7gQwfwRToPeuzoUQDayHKQrr4Ivhi4Ch
42
42
  sibi_dst/utils/_log_utils.py,sha256=AvKu5Qmi9LXezA7gdkG7yV-MvzZeav8c3PK8s-DwTGE,2314
43
43
  sibi_dst/utils/_parquet_saver.py,sha256=hLrWr1G132y94eLopDPPGQGDsAiR1lQ8id4QQtGYPE4,4349
44
44
  sibi_dst/utils/_storage_manager.py,sha256=7nkfeBW_2xlF59pGj7V2aY5TLwpJnPQuPVclqjavJOA,3856
45
- sibi_dst-0.3.21.dist-info/METADATA,sha256=P0GRxm9kh8V1mSjJgfvBSQplvfTuDnPbZZ71uOzwPJQ,2134
46
- sibi_dst-0.3.21.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
47
- sibi_dst-0.3.21.dist-info/RECORD,,
45
+ sibi_dst-0.3.22.dist-info/METADATA,sha256=syXtIBRGv51uoahxxWFHbWvFkHIezDeAWzXCpkAOZWA,2134
46
+ sibi_dst-0.3.22.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
47
+ sibi_dst-0.3.22.dist-info/RECORD,,