sibi-dst 0.3.21__tar.gz → 0.3.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/PKG-INFO +1 -1
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/pyproject.toml +1 -1
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_data_wrapper.py +75 -12
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/README.md +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/__init__.py +1 -1
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_db_connection.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_io_dask_alt.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/__init__.py +9 -9
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_airflow_manager.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_clickhouse_writer.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_credentials.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_data_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_date_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_df_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_file_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_filepath_generator.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_log_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_parquet_saver.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/utils/_storage_manager.py +0 -0
@@ -1,4 +1,5 @@
|
|
1
1
|
import datetime
|
2
|
+
from concurrent.futures import ThreadPoolExecutor
|
2
3
|
from typing import Type, Any, Dict, Optional
|
3
4
|
|
4
5
|
import fsspec
|
@@ -32,7 +33,8 @@ class DataWrapper:
|
|
32
33
|
logger: Optional[Logger] = None,
|
33
34
|
max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
|
34
35
|
history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
|
35
|
-
show_progress: bool = False
|
36
|
+
show_progress: bool = False,
|
37
|
+
timeout: Optional[int] = 300):
|
36
38
|
self.dataclass = dataclass
|
37
39
|
self.date_field = date_field
|
38
40
|
self.data_path = self.ensure_forward_slash(data_path)
|
@@ -50,6 +52,7 @@ class DataWrapper:
|
|
50
52
|
self.max_age_minutes = max_age_minutes
|
51
53
|
self.history_days_threshold = history_days_threshold
|
52
54
|
self.show_progress = show_progress
|
55
|
+
self.timeout = timeout
|
53
56
|
|
54
57
|
self.start_date = self.convert_to_date(start_date)
|
55
58
|
self.end_date = self.convert_to_date(end_date)
|
@@ -76,31 +79,79 @@ class DataWrapper:
|
|
76
79
|
yield date.date()
|
77
80
|
|
78
81
|
def process(self):
|
79
|
-
"""Execute the update plan following the specified hierarchy."""
|
82
|
+
"""Execute the update plan using 'update_priority' to determine processing order."""
|
80
83
|
update_plan_table = self.generate_update_plan_with_conditions()
|
81
84
|
|
82
|
-
# Display the update plan table to the user if show_progress is True
|
85
|
+
# Display the update plan table to the user if requested
|
83
86
|
if self.show_progress:
|
84
87
|
display(update_plan_table)
|
85
88
|
|
86
|
-
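# Process files according to the hierarchy, considering only `update_required` dates
|
87
|
-
for category, description in [
|
88
|
-
("overwrite", "Processing files due to overwrite=True"),
|
89
|
-
("history_days", "Processing files within history_days_threshold"),
|
90
|
-
("missing_files", "Processing missing files")
|
91
|
-
]:
|
92
|
-
# Filter dates in the category where `update_required` is True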
|
89
|
+
# Filter out rows that do not require updates (priority 0 means skip)
|
90
|
+
update_plan_table = update_plan_table[
|
91
|
+
(update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
|
92
|
+
]
|
93
|
+
|
94
|
+
# Group by priority
|
95
|
+
priorities = sorted(update_plan_table["update_priority"].unique())
|
96
|
+
|
97
|
+
# We will process each priority level in its own thread.
|
98
|
+
# Each thread will handle all dates associated with that priority.
|
99
|
+
def process_priority(priority):
|
100
|
+
# Extract dates for the current priority
|
93
101
|
dates_to_process = update_plan_table[
|
94
|
-
(update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
|
102
|
+
update_plan_table["update_priority"] == priority
|
95
103
|
]["date"].tolist()
|
96
104
|
|
105
|
+
# If show_progress is True, wrap in a progress bar
|
97
106
|
date_iterator = dates_to_process
|
98
107
|
if self.show_progress:
|
99
|
-
date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
|
108
|
+
date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
|
109
|
+
unit="date")
|
100
110
|
|
111
|
+
# Process each date for this priority
|
101
112
|
for current_date in date_iterator:
|
102
113
|
self.process_date(current_date)
|
103
114
|
|
115
|
+
# Launch a separate thread for each priority
|
116
|
+
with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
|
117
|
+
futures = {executor.submit(process_priority, p): p for p in priorities}
|
118
|
+
for future in futures:
|
119
|
+
try:
|
120
|
+
future.result(timeout=self.timeout)
|
121
|
+
except TimeoutError:
|
122
|
+
self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
|
123
|
+
future.cancel()
|
124
|
+
priority = futures[future]
|
125
|
+
new_future = executor.submit(process_priority, priority)
|
126
|
+
futures[new_future] = priority
|
127
|
+
self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
|
128
|
+
|
129
|
+
# def process(self):
|
130
|
+
# """Execute the update plan following the specified hierarchy."""
|
131
|
+
# update_plan_table = self.generate_update_plan_with_conditions()
|
132
|
+
#
|
133
|
+
# # Display the update plan table to the user if show_progress is True
|
134
|
+
# if self.show_progress:
|
135
|
+
# display(update_plan_table)
|
136
|
+
#
|
137
|
+
# # Process files according to the hierarchy, considering only `update_required` dates
|
138
|
+
# for category, description in [
|
139
|
+
# ("overwrite", "Processing files due to overwrite=True"),
|
140
|
+
# ("history_days", "Processing files within history_days_threshold"),
|
141
|
+
# ("missing_files", "Processing missing files")
|
142
|
+
# ]:
|
143
|
+
# # Filter dates in the category where `update_required` is True
|
144
|
+
# dates_to_process = update_plan_table[
|
145
|
+
# (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
|
146
|
+
# ]["date"].tolist()
|
147
|
+
#
|
148
|
+
# date_iterator = dates_to_process
|
149
|
+
# if self.show_progress:
|
150
|
+
# date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
|
151
|
+
#
|
152
|
+
# for current_date in date_iterator:
|
153
|
+
# self.process_date(current_date)
|
154
|
+
|
104
155
|
def is_file_older_than(self, file_path: str) -> bool:
|
105
156
|
"""
|
106
157
|
Check if a file is older than the specified max_age_minutes.
|
@@ -181,12 +232,14 @@ class DataWrapper:
|
|
181
232
|
category = "history_days"
|
182
233
|
update_required = True
|
183
234
|
else:
|
235
|
+
category = "file age is recent"
|
184
236
|
update_required = False
|
185
237
|
# Hierarchy 3: Missing files
|
186
238
|
elif missing_file and current_date <= today:
|
187
239
|
category = "missing_files"
|
188
240
|
update_required = True
|
189
241
|
else:
|
242
|
+
category = "No Update Required"
|
190
243
|
update_required = False
|
191
244
|
|
192
245
|
# Collect condition descriptions for the update plan table
|
@@ -199,6 +252,16 @@ class DataWrapper:
|
|
199
252
|
"update_category": category,
|
200
253
|
"datawrapper class": self.dataclass.__name__
|
201
254
|
})
|
255
|
+
priority_map = {
|
256
|
+
"overwrite": 1,
|
257
|
+
"history_days": 2,
|
258
|
+
"missing_files": 3
|
259
|
+
}
|
260
|
+
|
261
|
+
for row in rows:
|
262
|
+
category = row.get("update_category")
|
263
|
+
# Default to None if no category assigned (no update required)
|
264
|
+
row["update_priority"] = priority_map.get(category, 0)
|
202
265
|
|
203
266
|
update_plan_table = pd.DataFrame(rows)
|
204
267
|
return update_plan_table
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,8 +1,8 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
from ._io_dask import ReadFrameDask
|
3
4
|
from ._django_db_connection import DjangoConnectionConfig
|
4
5
|
from ._django_load_from_db import DjangoLoadFromDb
|
5
|
-
from ._io_dask import ReadFrameDask
|
6
6
|
|
7
7
|
__all__ = [
|
8
8
|
"DjangoConnectionConfig",
|
{sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_db_connection.py
RENAMED
File without changes
|
{sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_load_from_db.py
RENAMED
File without changes
|
{sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py
RENAMED
File without changes
|
{sibi_dst-0.3.21 → sibi_dst-0.3.22}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,22 +1,22 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from ._airflow_manager import AirflowDAGManager
|
4
|
-
from ._clickhouse_writer import ClickHouseWriter
|
5
|
-
from ._credentials import *
|
6
|
-
from ._data_utils import DataUtils
|
7
|
-
from ._data_wrapper import DataWrapper
|
3
|
+
from ._log_utils import Logger
|
8
4
|
from ._date_utils import *
|
9
|
-
from ._df_utils import DfUtils
|
5
|
+
from ._data_utils import DataUtils
|
10
6
|
from ._file_utils import FileUtils
|
11
7
|
from ._filepath_generator import FilePathGenerator
|
12
|
-
from ._log_utils import Logger
|
13
|
-
from ._parquet_saver import ParquetSaver
|
8
|
+
from ._df_utils import DfUtils
|
14
9
|
from ._storage_manager import StorageManager
|
10
|
+
from ._parquet_saver import ParquetSaver
|
11
|
+
from ._clickhouse_writer import ClickHouseWriter
|
12
|
+
from ._airflow_manager import AirflowDAGManager
|
13
|
+
from ._credentials import *
|
14
|
+
from ._data_wrapper import DataWrapper
|
15
15
|
|
16
16
|
__all__ = [
|
17
|
+
"Logger",
|
17
18
|
"ConfigManager",
|
18
19
|
"ConfigLoader",
|
19
|
-
"Logger",
|
20
20
|
"DateUtils",
|
21
21
|
"BusinessDays",
|
22
22
|
"FileUtils",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|