sibi-dst 0.3.56__py3-none-any.whl → 0.3.57__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +165 -166
- sibi_dst/df_helper/_df_helper.py +55 -23
- sibi_dst/df_helper/_parquet_artifact.py +29 -11
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +182 -89
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +6 -2
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/data_wrapper.py +34 -93
- sibi_dst/utils/parquet_saver.py +15 -12
- sibi_dst/utils/update_planner.py +237 -0
- {sibi_dst-0.3.56.dist-info → sibi_dst-0.3.57.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.56.dist-info → sibi_dst-0.3.57.dist-info}/RECORD +12 -11
- {sibi_dst-0.3.56.dist-info → sibi_dst-0.3.57.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_parquet_artifact.py
CHANGED
@@ -1,5 +1,6 @@
 import datetime
 import logging
+import threading
 from typing import Optional, Any, Dict
 
 import dask.dataframe as dd
@@ -78,6 +79,7 @@ class ParquetArtifact(DfHelper):
             `parquet_filename`, `parquet_start_date`,
             or `parquet_end_date`) are missing or not set properly.
         """
+        self._lock = threading.Lock()
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,
@@ -119,21 +121,36 @@ class ParquetArtifact(DfHelper):
         super().__init__(**self.config)
 
     def load(self, **kwargs):
-        self.
+        with self._lock:
+            self.df = super().load(**kwargs)
         return self.df
 
     def generate_parquet(self, **kwargs) -> None:
         """
         Generate a Parquet file using the configured DataWrapper class.
         """
-
-
-
+        with self._lock:
+            params = self._prepare_params(kwargs)
+            dw = DataWrapper(self.data_wrapper_class, **params)
+            dw.process()
+
+    def __enter__(self):
+        if getattr(self, "_entered", False):
+            return self
+        self._entered = True
+        self.ensure_directory_exists(self.parquet_storage_path)
+        return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-
-
-
+        try:
+            if getattr(self, "_entered", False) and self.fs:
+                self.fs.close()
+        except Exception as e:
+            self.logger.warning(f"Error closing filesystem: {e}")
+        finally:
+            self._entered = False
+            # return False so exceptions aren’t suppressed
+            return False
 
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
         """Update the Parquet file with data from a specific period."""
@@ -223,7 +240,8 @@ class ParquetArtifact(DfHelper):
 
     def ensure_directory_exists(self, path: str) -> None:
         """Ensure the directory exists in the specified filesystem."""
-
-
-
-
+        with self._lock:
+            try:
+                self.fs.makedirs(path, exist_ok=True)
+            except Exception as e:
+                raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
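For orientation, here is a minimal usage sketch of the locking and context-manager behavior introduced above. It is not taken from the package; the subclass name and constructor keys are illustrative assumptions based only on the attributes visible in this diff.

    # Hedged sketch: exercising the new __enter__/__exit__ protocol of ParquetArtifact.
    # "OrdersArtifact" and the config keys below are hypothetical placeholders.
    with OrdersArtifact(
        parquet_storage_path="file:///data/orders/",
        parquet_start_date="2025-01-01",
        parquet_end_date="2025-01-31",
    ) as artifact:            # __enter__ guards against re-entry and ensures the directory exists
        df = artifact.load()  # load() and generate_parquet() now serialize on self._lock
    # __exit__ closes self.fs if it was opened and returns False, so exceptions still propagate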
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
CHANGED
@@ -1,109 +1,202 @@
-from 
+from __future__ import annotations
+from typing import Any, Optional, ClassVar, Generator, Type
 import threading
-
-from pydantic import BaseModel, model_validator
-from sqlalchemy import create_engine
+from contextlib import contextmanager
+from pydantic import BaseModel, field_validator, ValidationError, model_validator
+from sqlalchemy import create_engine, event, text
+from sqlalchemy.engine import url as sqlalchemy_url
+from sqlalchemy.engine import Engine
 from sqlalchemy.exc import OperationalError
-from sqlalchemy.
+from sqlalchemy.pool import QueuePool, NullPool, StaticPool
+from sqlalchemy.orm import sessionmaker, Session
 from sibi_dst.utils import Logger
-
 from ._sql_model_builder import SqlAlchemyModelBuilder
 
+
 class SqlAlchemyConnectionConfig(BaseModel):
     """
-
-
-
-
-
-
-
-    :ivar connection_url: The URL used to connect to the database.
-    :type connection_url: str
-    :ivar table: The name of the database table for which a model will be constructed.
-    :type table: Optional[str]
-    :ivar model: The dynamically built SQLAlchemy model for the specified table.
-    :type model: Any
-    :ivar engine: The SQLAlchemy engine instance reused for database connections.
-    :type engine: Optional[Any]
+    Thread-safe, registry-backed SQLAlchemy connection manager with:
+      - Shared engine reuse
+      - Active connection tracking
+      - Idle-pool and database-level cleanup
+      - Dynamic ORM model building via SqlAlchemyModelBuilder
+      - Optional session factory
     """
     connection_url: str
     table: Optional[str] = None
-    model: Any = None
-    engine: Optional[
-    logger:
-
-
+    model: Optional[Any] = None
+    engine: Optional[Engine] = None
+    logger: Logger = None
+    debug: bool = False
+
+    pool_size: int = 5
+    max_overflow: int = 10
     pool_timeout: int = 30
-    pool_recycle:int = 300
+    pool_recycle: int = 300
+    pool_pre_ping: bool = True
+    poolclass: Type = QueuePool
 
-
-
+    session_factory: Optional[sessionmaker] = None
+    _owns_engine: bool = False
+
+    _engine_registry: ClassVar[dict[tuple, Engine]] = {}
     _registry_lock: ClassVar[threading.Lock] = threading.Lock()
+    _active_connections: ClassVar[int] = 0
 
-
-
-
-
-
-
-
-
-
-
-
-        # Validate `connection_url`
-        if self.engine is not None:
-            engine_url = str(self.engine.url)
-            if engine_url != self.connection_url:
-                raise ValueError(f"Engine URL '{engine_url}' does not match the provided connection URL '{self.connection_url}'.")
-        else:
-            # Generate a unique key for the engine registry based on the connection URL
-            engine_key = (
-                self.connection_url,
-                self.pool_size,
-                self.max_overflow,
-                self.pool_timeout,
-                self.pool_recycle
-            )
-            with self.__class__._registry_lock:
-                if engine_key in self.__class__._engine_registry:
-                    # Reuse the existing engine
-                    self.logger.info(f"Reusing existing engine for connection URL: {self.connection_url}")
-                    self.engine = self.__class__._engine_registry[engine_key]
-                else:
-                    # Initialize the engine
-                    self.logger.info(f"Creating new engine for connection URL: {self.connection_url}")
-                    self.engine = create_engine(self.connection_url,
-                                                pool_size=self.pool_size,
-                                                max_overflow=self.max_overflow,
-                                                pool_timeout=self.pool_timeout,
-                                                pool_recycle=self.pool_recycle)
-                    self.__class__._engine_registry[engine_key] = self.engine
-
-        # Validate the connection
-        self.validate_connection()
-        if not self.table:
-            raise ValueError("`table_name` must be provided to build the model.")
-        try:
-            self.model = SqlAlchemyModelBuilder(self.engine, self.table).build_model()
-        except Exception as e:
-            raise ValueError(f"Failed to build model for table '{self.table}': {e}")
+    class Config:
+        arbitrary_types_allowed = True
+        underscore_attrs_are_private = True
+
+    @field_validator("pool_size", "max_overflow", "pool_timeout", "pool_recycle")
+    @classmethod
+    def _validate_pool_params(cls, v: int) -> int:
+        if v < 0:
+            raise ValueError("Pool parameters must be non-negative")
+        return v
 
+    @model_validator(mode="after")
+    def _init_all(self) -> SqlAlchemyConnectionConfig:
+        self._init_logger()
+        self._init_engine()
+        self._validate_conn()
+        self._build_model()
+        self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=False)
         return self
 
-    def 
-
-
+    def _init_logger(self) -> None:
+        self.logger = self.logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+    def _engine_key(self) -> tuple:
+        parsed = sqlalchemy_url.make_url(self.connection_url)
+        query = {k: v for k, v in parsed.query.items() if not k.startswith("pool_")}
+        normalized = parsed.set(query=query)
+        key = [str(normalized)]
+        if self.poolclass not in (NullPool, StaticPool):
+            key += [self.pool_size, self.max_overflow, self.pool_timeout, self.pool_recycle, self.pool_pre_ping, self.table]
+        return tuple(key)
+
+    def _init_engine(self) -> None:
+        key = self._engine_key()
+        with self._registry_lock:
+            existing = self._engine_registry.get(key)
+            if existing:
+                self.engine = existing
+                self._owns_engine = False
+                self.logger.debug(f"Reusing engine {key}")
+            else:
+                self.logger.debug(f"Creating engine {key}")
+                self.engine = create_engine(
+                    self.connection_url,
+                    pool_size=self.pool_size,
+                    max_overflow=self.max_overflow,
+                    pool_timeout=self.pool_timeout,
+                    pool_recycle=self.pool_recycle,
+                    pool_pre_ping=self.pool_pre_ping,
+                    poolclass=self.poolclass,
+                )
+                self._attach_events()
+                self._engine_registry[key] = self.engine
+                self._owns_engine = True
+
+    def _attach_events(self) -> None:
+        event.listen(self.engine, "checkout", self._on_checkout)
+        event.listen(self.engine, "checkin", self._on_checkin)
+
+    def _on_checkout(self, *args) -> None:
+        with self._registry_lock:
+            type(self)._active_connections += 1
+        self.logger.debug(f"Checked out, active: {self.active_connections}")
+
+    def _on_checkin(self, *args) -> None:
+        with self._registry_lock:
+            type(self)._active_connections = max(type(self)._active_connections - 1, 0)
+        self.logger.debug(f"Checked in, active: {self.active_connections}")
+
+    @property
+    def active_connections(self) -> int:
+        return type(self)._active_connections
+
+    def _validate_conn(self) -> None:
         try:
-            with self.
-
+            with self.managed_connection() as conn:
+                conn.execute(text("SELECT 1"))
+            self.logger.debug("Connection OK")
         except OperationalError as e:
-
+            self.logger.error(f"Connection failed: {e}")
+            raise ValidationError(f"DB connection failed: {e}")
 
-    @
-    def 
-
-
-
+    @contextmanager
+    def managed_connection(self) -> Generator[Any, None, Any]:
+        conn = self.engine.connect()
+        try:
+            yield conn
+        finally:
+            conn.close()
+
+    def get_session(self) -> Session:
+        if not self.session_factory:
+            raise RuntimeError("Session factory not initialized")
+        return self.session_factory()
+
+    def _build_model(self) -> None:
+        """Dynamically build and assign the ORM model if table is set"""
+        if not self.table or not self.engine:
+            return
+        try:
+            builder = SqlAlchemyModelBuilder(self.engine, self.table)
+            self.model = builder.build_model()
+            self.logger.debug(f"Model built for table: {self.table}")
+        except Exception as e:
+            self.logger.error(f"Model build failed: {e}")
+            raise ValidationError(f"Model construction error: {e}") from e
+
+    def dispose_idle_connections(self) -> int:
+        key = self._engine_key()
+        with self._registry_lock:
+            if self._engine_registry.get(key) is not self.engine:
+                self.logger.warning("Engine changed")
+                return 0
+            pool = self.engine.pool
+            if isinstance(pool, QueuePool):
+                count = pool.checkedin()
+                pool.dispose()
+                self.logger.debug(f"Disposed {count}")
+                return count
+            self.logger.warning(f"No idle dispose for {type(pool).__name__}")
+            return 0
+
+    def terminate_idle_connections(self, idle_seconds: int = 300) -> int:
+        terminated = 0
+        dialect = self.engine.dialect.name
+        with self.managed_connection() as conn:
+            if dialect == 'postgresql':
+                res = conn.execute(text(
+                    f"SELECT pg_terminate_backend(pid) FROM pg_stat_activity "
+                    f"WHERE state='idle' AND (now() - query_start) > interval '{idle_seconds} seconds' "
+                    f"AND pid<>pg_backend_pid()"
+                ))
+                terminated = res.rowcount
+            elif dialect == 'mysql':
+                for row in conn.execute(text("SHOW PROCESSLIST")):
+                    if row.Command == 'Sleep' and row.Time > idle_seconds:
+                        conn.execute(text(f"KILL {row.Id}"))
+                        terminated += 1
+            else:
+                self.logger.warning(f"Idle termination not supported: {dialect}")
+        self.logger.debug(f"Terminated {terminated}")
+        return terminated
+
+    def close(self) -> None:
+        with self._registry_lock:
+            key = self._engine_key()
+            if not self._owns_engine:
+                self.logger.warning("Not owner, skipping close")
+                return
+            if self._engine_registry.get(key) != self.engine:
+                self.logger.warning("Engine not in registry")
+                return
+            self.engine.dispose()
+            del self._engine_registry[key]
+            type(self)._active_connections = 0
+            self.logger.debug(f"Engine closed {key}")
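To make the new surface easier to scan, a hedged usage sketch follows. It only touches methods that appear in the diff above; the import path follows the wheel layout shown in the file list, and the connection URL and table name are placeholders, not values from the package.

    # Hedged sketch of the rewritten SqlAlchemyConnectionConfig; URL and table are placeholders.
    from sqlalchemy import text
    from sibi_dst.df_helper.backends.sqlalchemy._db_connection import SqlAlchemyConnectionConfig

    cfg = SqlAlchemyConnectionConfig(
        connection_url="postgresql+psycopg2://user:pass@localhost:5432/appdb",
        table="orders",
        debug=True,
    )

    with cfg.managed_connection() as conn:       # connection is always closed on exit
        conn.execute(text("SELECT 1"))

    session = cfg.get_session()                  # sessionmaker built in _init_all()
    try:
        rows = session.query(cfg.model).limit(10).all()   # cfg.model is built from `table`
    finally:
        session.close()

    print(cfg.active_connections)                # pool checkout/checkin counter
    cfg.dispose_idle_connections()               # QueuePool-only idle cleanup
    cfg.close()                                  # disposes the engine only if this instance owns it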
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
CHANGED
@@ -133,9 +133,13 @@ class SqlAlchemyLoadFromDb:
                 dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
                 return dask_df
+
             return self.df
+        except RuntimeError as e:
+            self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return dask_df
         except Exception as e:
-            self.logger.
+            self.logger.info(f"Exception {e}:Failed to load data into Dask DataFrame.")
             dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
             return dask_df
sibi_dst/utils/__init__.py
CHANGED
@@ -12,11 +12,13 @@ from .parquet_saver import ParquetSaver
 from .clickhouse_writer import ClickHouseWriter
 from .airflow_manager import AirflowDAGManager
 from .credentials import *
+from .update_planner import UpdatePlanner
 from .data_wrapper import DataWrapper
 from .storage_config import StorageConfig
 from .data_from_http_source import DataFromHttpSource
 from .webdav_client import WebDAVClient
 
+
 __all__ = [
     "Logger",
     "ConfigManager",
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -2,7 +2,7 @@ import datetime
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Type, Any, Dict, Optional, Union, List, Tuple
-
+import threading
 import fsspec
 import pandas as pd
 from IPython.display import display
@@ -11,6 +11,7 @@ from tqdm import tqdm
 from .log_utils import Logger
 from .date_utils import FileAgeChecker
 from .parquet_saver import ParquetSaver
+from .update_planner import UpdatePlanner
 
 
 class DataWrapper:
@@ -48,7 +49,7 @@ class DataWrapper:
                  timeout: float = 60,
                  reference_date: datetime.date = None,
                  custom_priority_map: Dict[str, int] = None,
-                 max_threads: int = 
+                 max_threads: int = 3):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self._ensure_forward_slash(data_path)
@@ -75,10 +76,26 @@ class DataWrapper:
 
         self.start_date = self._convert_to_date(start_date)
         self.end_date = self._convert_to_date(end_date)
-        self._lock = Lock()
+        self._lock = threading.Lock()
         self.processed_dates = []
         self.age_checker = FileAgeChecker(logger=self.logger)
-
+
+        self.update_planner_params = {
+            "data_path": self.data_path,
+            "filename": self.parquet_filename,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "reverse_order": self.reverse_order,
+            "overwrite": self.overwrite,
+            "ignore_missing": self.ignore_missing,
+            "history_days_threshold": history_days_threshold,
+            "max_age_minutes": max_age_minutes,
+            "show_progress": self.show_progress,
+            "description": f"{self.dataclass.__name__}"
+        }
+        self.update_plan = UpdatePlanner(**self.update_planner_params).generate_plan(self.start_date, self.end_date)
+
 
     def _init_filesystem(self) -> fsspec.AbstractFileSystem:
         with self._lock:
@@ -115,13 +132,13 @@ class DataWrapper:
 
     def process(self, max_retries: int = 3):
         """Process updates with priority-based execution and retries"""
-        update_plan = self.generate_update_plan()
-
+        #update_plan = self.generate_update_plan()
+        update_plan = self.update_plan
         if update_plan.empty:
             self.logger.info("No updates required")
             return
         # Filter for required updates first
-        update_plan = update_plan[update_plan["update_required"] == True]
+        #update_plan = update_plan[update_plan["update_required"] == True]
 
         if self.show_progress:
             #display(self._enhanced_display_table(update_plan))
@@ -140,9 +157,9 @@ class DataWrapper:
             return
 
         desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
-        self.logger.
+        self.logger.debug(f"Starting {desc.lower()}")
         max_threads = min(len(dates), self.max_threads)
-        self.logger.
+        self.logger.debug(f"DataWrapper Max threads set at: {max_threads}")
         with ThreadPoolExecutor(max_workers=max_threads) as executor:
             futures = {
                 executor.submit(self._process_date_with_retry, date, max_retries): date
@@ -166,83 +183,6 @@ class DataWrapper:
                 f"Unknown Priority {priority}"
             )
 
-    def _enhanced_display_table(self, df: pd.DataFrame) -> pd.DataFrame.style:
-        """Format the update plan table for better readability"""
-        return df.style \
-            .bar(subset=["file_age_minutes"], color="#5fba7d") \
-            .background_gradient(subset=["update_priority"], cmap="YlOrBr") \
-            .set_caption(f"Update Plan: {self.dataclass.__name__}")
-
-    def generate_update_plan(self) -> pd.DataFrame:
-        """Generate update plan with parallel file status checks"""
-        dates = self.generate_date_range()
-        history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
-        rows = []
-
-        with ThreadPoolExecutor() as executor:
-            future_to_date = {
-                executor.submit(self._get_file_status, date): date
-                for date in dates
-            }
-
-            for future in tqdm(as_completed(future_to_date),
-                               total=len(future_to_date),
-                               desc=f"Analyzing files for {self.dataclass.__name__} ",
-                               disable=not self.show_progress):
-                current_date = future_to_date[future]
-                file_exists, file_age = future.result()
-                rows.append(self._create_plan_row(
-                    current_date,
-                    history_start,
-                    file_exists,
-                    file_age
-                ))
-
-        return pd.DataFrame(rows).sort_values("update_priority")
-
-    def _get_file_status(self, date: datetime.date) -> Tuple[bool, float]:
-        """Get file existence and age with error handling"""
-        path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/{self.parquet_filename}"
-        try:
-            exists = self.fs.exists(path)
-            age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
-            return exists, age
-        except Exception as e:
-            self.logger.warning(f"Error checking {path}: {str(e)}")
-            return False, None
-
-    def _create_plan_row(self,
-                         date: datetime.date,
-                         history_start: datetime.date,
-                         file_exists: bool,
-                         file_age: float) -> dict:
-        """Create a row for the update plan DataFrame"""
-        within_history = history_start <= date <= self.reference_date
-        category, update_required = "file_is_recent", False
-
-        if self.overwrite:
-            category, update_required = "overwrite", True
-        elif within_history:
-            if not file_exists:
-                category, update_required = "missing_in_history", True
-            elif file_age > self.max_age_minutes:
-                category, update_required = "existing_but_stale", True
-        elif not file_exists and not self.ignore_missing:
-            category, update_required = "missing_outside_history", True
-
-        return {
-            "date": date,
-            "file_exists": file_exists,
-            "file_age_minutes": file_age,
-            "age_threshold": self.max_age_minutes,
-            "within_history": within_history,
-            "ignore_missing": self.ignore_missing,
-            "update_category": category,
-            "update_priority": self.priority_map[category],
-            "update_required": update_required,
-            "class": self.dataclass.__name__
-        }
-
     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
         """Process a date with retry logic"""
         for attempt in range(1, max_retries + 1):
@@ -267,13 +207,14 @@ class DataWrapper:
         self.logger.debug(f"Class Params: {self.class_params}")
         self.logger.debug(f"Load Params: {self.load_params}")
 
-
-
-
-
-
-
-
+        df = pd.DataFrame()
+        with self.dataclass(**self.class_params) as data:
+            df = data.load_period(
+                dt_field=self.date_field,
+                start=date,
+                end=date,
+                **self.load_params
+            )
 
         if len(df.index)==0:
             self.logger.warning(f"No data found for {date}")
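The net effect of the changes above is that plan generation moves out of DataWrapper and into the new UpdatePlanner. A hedged sketch of the resulting flow follows; the constructor arguments mirror attribute names visible in this diff, but the full signature is not shown here, so treat them as assumptions.

    # Hedged sketch: DataWrapper now builds its plan through UpdatePlanner in __init__
    # and process() consumes self.update_plan. "MyDataset" is a hypothetical dataclass.
    from sibi_dst.utils import DataWrapper, UpdatePlanner  # UpdatePlanner is newly exported

    wrapper = DataWrapper(
        dataclass=MyDataset,
        date_field="created_at",
        data_path="file:///data/my_dataset/",
        parquet_filename="data.parquet",
        start_date="2025-01-01",
        end_date="2025-01-31",
        max_threads=3,          # new default per the diff
    )
    wrapper.process()           # iterates self.update_plan instead of generate_update_plan()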
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -1,6 +1,7 @@
 import base64
 import hashlib
 import logging
+import threading
 import warnings
 from typing import Optional
 
@@ -27,6 +28,7 @@ class ParquetSaver:
         self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.fs = fs
         self.protocol = self.parquet_storage_path.split(":")[0]
+        self._lock = threading.Lock()
 
     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
         """
@@ -34,18 +36,19 @@ class ParquetSaver:
         :param parquet_filename: Filename for the Parquet file.
         :param clear_existing: Whether to clear existing files in the target directory.
         """
-
-
-
-
-
-
-
-
-
-
-
-        self.fs.close
+        with self._lock:
+            full_path = self._construct_full_path(parquet_filename)
+            self.logger.info(f"Save method for :{full_path}")
+            # Ensure directory exists and clear if necessary
+            self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+
+            # Define schema and save DataFrame to Parquet
+            schema = self._define_schema()
+            self._convert_dtypes(schema)
+            self._save_dataframe_to_parquet(full_path, schema)
+            # Close the filesystem if the close method exists
+            if hasattr(self.fs, 'close') and callable(getattr(self.fs, 'close', None)):
+                self.fs.close()
 
     def _define_schema(self) -> pa.Schema:
         """Define a PyArrow schema dynamically based on df_result column types."""