sibi-dst 2025.8.6__py3-none-any.whl → 2025.8.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +111 -61
- sibi_dst/df_helper/_parquet_artifact.py +11 -10
- sibi_dst/df_helper/_parquet_reader.py +4 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +504 -214
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +11 -10
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +9 -8
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +4 -76
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -104
- sibi_dst/utils/async_utils.py +12 -0
- sibi_dst/utils/boilerplate/__init__.py +6 -0
- sibi_dst/utils/boilerplate/base_data_artifact.py +110 -0
- sibi_dst/utils/boilerplate/base_data_cube.py +79 -0
- sibi_dst/utils/data_wrapper.py +22 -263
- sibi_dst/utils/iceberg_saver.py +126 -0
- sibi_dst/utils/log_utils.py +0 -346
- sibi_dst/utils/parquet_saver.py +110 -9
- sibi_dst/utils/progress/__init__.py +5 -0
- sibi_dst/utils/progress/jobs.py +82 -0
- sibi_dst/utils/progress/sse_runner.py +82 -0
- sibi_dst/utils/storage_hive.py +232 -0
- sibi_dst/utils/update_planner.py +617 -116
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/METADATA +3 -2
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/RECORD +24 -15
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py

@@ -29,6 +29,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
     connection_url: str
     table: Optional[str] = None
     debug: bool = False
+    logger_extra: Optional[Dict[str, Any]] = {"sibi_dst_component": __name__}

     # --- Pool Configuration ---
     pool_size: int = int(os.environ.get("DB_POOL_SIZE", 5))
@@ -99,10 +100,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
             self.engine = wrapper["engine"]
             wrapper["ref_count"] += 1
             if self.debug:
-                self.logger.debug(f"Reusing engine. Ref count: {wrapper['ref_count']}.")
+                self.logger.debug(f"Reusing engine. Ref count: {wrapper['ref_count']}.", extra=self.logger_extra)
         else:
             if self.debug:
-                self.logger.debug(f"Creating new engine for key: {self._engine_key_instance}")
+                self.logger.debug(f"Creating new engine for key: {self._engine_key_instance}", extra=self.logger_extra)
             try:
                 new_engine = create_engine(
                     self.connection_url,
@@ -121,7 +122,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
                     "active_connections": 0,
                 }
             except Exception as e:
-                self.logger.error(f"Failed to create engine: {e}")
+                self.logger.error(f"Failed to create engine: {e}", extra=self.logger_extra)
                 raise SQLAlchemyError(f"Engine creation failed: {e}") from e

     def close(self) -> None:
@@ -134,14 +135,14 @@ class SqlAlchemyConnectionConfig(BaseModel):
         key = self._engine_key_instance
         wrapper = _ENGINE_REGISTRY.get(key)
         if not wrapper:
-            self.logger.warning("Attempted to close a config whose engine is not in the registry.")
+            self.logger.warning("Attempted to close a config whose engine is not in the registry.", extra=self.logger_extra)
         else:
             wrapper["ref_count"] -= 1
             if self.debug:
-                self.logger.debug(f"Closing connection. Ref count now {wrapper['ref_count']}.")
+                self.logger.debug(f"Closing connection. Ref count now {wrapper['ref_count']}.", extra=self.logger_extra)
             if wrapper["ref_count"] <= 0:
                 if self.debug:
-                    self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
+                    self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}", extra=self.logger_extra)
                 try:
                     wrapper["engine"].dispose()
                 finally:
@@ -177,9 +178,9 @@ class SqlAlchemyConnectionConfig(BaseModel):
             with self.managed_connection() as conn:
                 conn.execute(text("SELECT 1"))
             if self.debug:
-                self.logger.debug("Database connection validated successfully.")
+                self.logger.debug("Database connection validated successfully.", extra=self.logger_extra)
         except OperationalError as e:
-            self.logger.error(f"Database connection failed: {e}")
+            self.logger.error(f"Database connection failed: {e}", extra=self.logger_extra)
             raise ValueError(f"DB connection failed: {e}") from e

     @contextmanager
@@ -204,8 +205,8 @@ class SqlAlchemyConnectionConfig(BaseModel):
             builder = SqlAlchemyModelBuilder(self.engine, self.table)
             self.model = builder.build_model()
             if self.debug:
-                self.logger.debug(f"Successfully built ORM model for table: {self.table}")
+                self.logger.debug(f"Successfully built ORM model for table: {self.table}", extra=self.logger_extra)
         except Exception as e:
-            self.logger.error(f"Failed to build ORM model for table '{self.table}': {e}")
+            self.logger.error(f"Failed to build ORM model for table '{self.table}': {e}", extra=self.logger_extra)
             raise ValueError(f"Model construction failed for table '{self.table}': {e}") from e
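The recurring change across these hunks is that every log call now passes `extra=self.logger_extra`, so each record carries a `sibi_dst_component` attribute. A minimal sketch of how such an attribute can be surfaced with the standard `logging` module; the handler, filter, and format string below are illustrative, not part of sibi_dst:

```python
import logging


class ComponentDefaultFilter(logging.Filter):
    """Backfill the attribute so records logged without `extra` still format."""

    def filter(self, record: logging.LogRecord) -> bool:
        if not hasattr(record, "sibi_dst_component"):
            record.sibi_dst_component = "unknown"
        return True


handler = logging.StreamHandler()
handler.addFilter(ComponentDefaultFilter())
handler.setFormatter(logging.Formatter("%(levelname)s [%(sibi_dst_component)s] %(message)s"))

logger = logging.getLogger("sibi_dst.demo")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

# Mirrors the diff: the component dict is defined once and passed on every call.
logger_extra = {"sibi_dst_component": "sibi_dst.df_helper.backends.sqlalchemy._db_connection"}
logger.debug("Reusing engine. Ref count: 2.", extra=logger_extra)
```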
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py

@@ -38,6 +38,7 @@ class SQLAlchemyDask(ManagedResource):
         "TIME": "object",
         "UUID": "object",
     }
+    logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}

     def __init__(
         self,
@@ -97,7 +98,7 @@ class SQLAlchemyDask(ManagedResource):
         max_overflow = _to_int(max_overflow_attr, 10)

         cap = max(1, pool_size + max_overflow - 1)
-        self.logger.debug(f"Using a Cap of {cap} from pool size of {pool_size} and max overflow of {max_overflow}.")
+        self.logger.debug(f"Using a Cap of {cap} from pool size of {pool_size} and max overflow of {max_overflow}.", extra=self.logger_extra)
         return max(1, cap)

     # ---------- meta ----------
@@ -140,25 +141,25 @@ class SQLAlchemyDask(ManagedResource):
                 break
             except SASQLTimeoutError:
                 if attempt < retry_attempts - 1:
-                    self.logger.warning(f"Connection pool limit reached. Retrying in {backoff} seconds...")
+                    self.logger.warning(f"Connection pool limit reached. Retrying in {backoff} seconds...", extra=self.logger_extra)
                     time.sleep(backoff)
                     backoff *= 2
                 else:
                     self.total_records = -1
-                    self.logger.error("Failed to get a connection from the pool after retries.", exc_info=True)
+                    self.logger.error("Failed to get a connection from the pool after retries.", exc_info=True, extra=self.logger_extra)
                     return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except OperationalError as oe:
                 if "timeout" in str(oe).lower() and attempt < retry_attempts - 1:
-                    self.logger.warning("Operational timeout, retrying…", exc_info=self.debug)
+                    self.logger.warning("Operational timeout, retrying…", exc_info=self.debug, extra=self.logger_extra)
                     time.sleep(backoff)
                     backoff *= 2
                     continue
                 self.total_records = -1
-                self.logger.error("OperationalError during count.", exc_info=True)
+                self.logger.error("OperationalError during count.", exc_info=True, extra=self.logger_extra)
                 return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except Exception as e:
                 self.total_records = -1
-                self.logger.error(f"Unexpected error during count: {e}", exc_info=True)
+                self.logger.error(f"Unexpected error during count: {e}", exc_info=True, extra=self.logger_extra)
                 return self.total_records, dd.from_pandas(meta_df, npartitions=1)

         self.total_records = int(total)
@@ -167,7 +168,7 @@ class SQLAlchemyDask(ManagedResource):
             super().close()
             return self.total_records, dd.from_pandas(meta_df, npartitions=1)

-        self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
+        self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.", extra=self.logger_extra)

         @dask.delayed
         def get_chunk(sql_query, chunk_offset):
@@ -181,6 +182,6 @@ class SQLAlchemyDask(ManagedResource):
         offsets = range(0, total, self.chunk_size)
         delayed_chunks = [get_chunk(query, off) for off in offsets]
         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
-        self.logger.debug(f"
+        self.logger.debug(f"{self.model.__name__} created Dask DataFrame with {ddf.npartitions} partitions.", extra=self.logger_extra)
         return self.total_records, ddf
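`_io_dask.py` keeps its offset-based chunking: count the rows, build one `dask.delayed` task per `chunk_size` window, and assemble the partitions with `dd.from_delayed` against an empty `meta` frame. A self-contained sketch of that pattern; `build_chunked_frame` and `read_chunk` are stand-ins, not sibi_dst APIs:

```python
import dask
import dask.dataframe as dd
import pandas as pd


def build_chunked_frame(total: int, chunk_size: int, meta_df: pd.DataFrame, read_chunk) -> dd.DataFrame:
    @dask.delayed
    def get_chunk(offset: int) -> pd.DataFrame:
        # Each partition fetches rows [offset, offset + chunk_size).
        return read_chunk(offset, chunk_size)

    offsets = range(0, total, chunk_size)
    delayed_chunks = [get_chunk(off) for off in offsets]
    # meta_df supplies the empty, correctly-typed schema Dask needs up front.
    return dd.from_delayed(delayed_chunks, meta=meta_df)


# Example with an in-memory "table" standing in for the database.
rows = pd.DataFrame({"id": range(10), "value": range(10)})
meta = rows.iloc[0:0]
ddf = build_chunked_frame(
    total=len(rows),
    chunk_size=4,
    meta_df=meta,
    read_chunk=lambda off, size: rows.iloc[off:off + size],
)
print(ddf.npartitions)      # 3 partitions for 10 rows at chunk_size=4
print(len(ddf.compute()))   # 10
```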
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py

@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Any, Tuple
+from typing import Any, Tuple, Dict

 import dask.dataframe as dd
 import pandas as pd
@@ -15,6 +15,7 @@ class SqlAlchemyLoadFromDb(ManagedResource):
     """
     Orchestrates loading data from a database using SQLAlchemy into a Dask DataFrame.
     """
+    logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}

     def __init__(
         self,
@@ -43,86 +44,13 @@ class SqlAlchemyLoadFromDb(ManagedResource):
                 verbose=self.verbose,
                 debug=self.debug,
             ) as loader:
-                self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
+                self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}", extra=self.logger_extra)
                 self.total_records, dask_df = loader.read_frame()
                 return self.total_records, dask_df
         except Exception as e:
             self.total_records = -1
-            self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
+            self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True, extra=self.logger_extra)
             # empty df with correct columns
             columns = [c.name for c in self.model.__table__.columns]
             return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)

-# from __future__ import annotations
-#
-# from typing import Any
-#
-# import dask.dataframe as dd
-# import pandas as pd
-#
-# from sibi_dst.utils import ManagedResource
-# from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
-# from ._db_connection import SqlAlchemyConnectionConfig
-# from ._io_dask import SQLAlchemyDask
-#
-# class SqlAlchemyLoadFromDb(ManagedResource):
-#     """
-#     Orchestrates loading data from a database using SQLAlchemy into a Dask
-#     DataFrame by configuring and delegating to the SQLAlchemyDask loader.
-#     """
-#
-#     def __init__(
-#             self,
-#             plugin_sqlalchemy: SqlAlchemyConnectionConfig,
-#             plugin_query: QueryConfig = None,
-#             plugin_params: ParamsConfig = None,
-#             **kwargs,
-#     ):
-#         """
-#         Initializes the loader with all necessary configurations.
-#
-#         Args:
-#             plugin_sqlalchemy: The database connection configuration object.
-#             plugin_query: The query configuration object.
-#             plugin_params: The parameters and filters configuration object.
-#             logger: An optional logger instance.
-#             **kwargs: Must contain 'index_column' for Dask partitioning.
-#         """
-#         super().__init__(**kwargs)
-#         self.db_connection = plugin_sqlalchemy
-#         self.model = self.db_connection.model
-#         self.engine = self.db_connection.engine
-#         self.query_config = plugin_query
-#         self.params_config = plugin_params
-#         self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
-#         self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
-#
-#     def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
-#         """
-#         Builds and loads a Dask DataFrame from a SQLAlchemy source.
-#
-#         This method is stateless and returns the DataFrame directly.
-#
-#         Returns:
-#             A Dask DataFrame containing the queried data or an empty,
-#             correctly structured DataFrame if the query fails or returns no results.
-#         """
-#         try:
-#             # Instantiate and use the low-level Dask loader
-#             with SQLAlchemyDask(model=self.model,filters=self.params_config.filters if self.params_config else {},
-#                                 engine=self.engine,
-#                                 chunk_size=self.chunk_size,
-#                                 logger=self.logger,
-#                                 verbose=self.verbose,
-#                                 debug=self.debug) as sqlalchemy_dask_loader:
-#                 self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
-#                 # Create the lazy DataFrame and read a record count
-#                 # if total_records less than 0, it means an error occurred during the loading process
-#                 self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
-#                 return self.total_records, dask_df
-#         except Exception as e:
-#             self.total_records = -1
-#             self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
-#             # Return an empty dataframe with the correct schema on failure
-#             columns = [c.name for c in self.model.__table__.columns]
-#             return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py

@@ -48,107 +48,3 @@ class SqlAlchemyModelBuilder:
             return f"{sane_name}_field"
         return sane_name

-# import re
-# import keyword
-# import threading
-# from sqlalchemy import MetaData, Engine
-# from sqlalchemy.orm import DeclarativeBase
-#
-#
-# class Base(DeclarativeBase):
-#     """Shared declarative base for all ORM models."""
-#     pass
-#
-#
-# apps_label = "datacubes.models"
-#
-#
-# class SqlAlchemyModelBuilder:
-#     """
-#     Builds a single SQLAlchemy ORM model from a specific database table.
-#     This class is thread-safe and caches reflected table metadata to
-#     improve performance across multiple instantiations.
-#     """
-#     _lock = threading.Lock()
-#     _metadata_cache: dict[str, MetaData] = {}
-#
-#     def __init__(self, engine: Engine, table_name: str):
-#         """
-#         Initializes the model builder for a specific table.
-#
-#         Args:
-#             engine: The SQLAlchemy engine connected to the database.
-#             table_name: The name of the table to generate the model for.
-#         """
-#         self.engine = engine
-#         self.table_name = table_name
-#         self.class_name = self._normalize_class_name(self.table_name)
-#
-#         engine_key = str(engine.url)
-#
-#         # ✅ REFACTOR: Acquire lock to make cache access and creation atomic,
-#         # preventing a race condition between multiple threads.
-#         with self._lock:
-#             if engine_key not in self._metadata_cache:
-#                 self._metadata_cache[engine_key] = MetaData()
-#             self.metadata = self._metadata_cache[engine_key]
-#
-#     def build_model(self) -> type:
-#         """
-#         Builds and returns a database model class for the specified table.
-#         This process is atomic and thread-safe.
-#
-#         Raises:
-#             ValueError: If the specified table does not exist in the database.
-#         Returns:
-#             The dynamically created ORM model class.
-#         """
-#         with self._lock:
-#             # NOTE: Using a private SQLAlchemy API. This is a performance
-#             # optimization but may break in future versions of the library.
-#             registered_model = Base.registry._class_registry.get(self.class_name)
-#             if registered_model:
-#                 return registered_model
-#
-#             # Check if the table's schema is in our metadata cache
-#             table = self.metadata.tables.get(self.table_name)
-#
-#             # If not cached, reflect it from the database
-#             if table is None:
-#                 self.metadata.reflect(bind=self.engine, only=[self.table_name])
-#                 table = self.metadata.tables.get(self.table_name)
-#
-#             if table is None:
-#                 raise ValueError(
-#                     f"Table '{self.table_name}' does not exist in the database."
-#                 )
-#
-#             # Create the model class dynamically.
-#             attrs = {
-#                 "__tablename__": table.name,
-#                 "__table__": table,
-#                 "__module__": apps_label,
-#             }
-#             model = type(self.class_name, (Base,), attrs)
-#
-#             return model
-#
-#     @staticmethod
-#     def _normalize_class_name(table_name: str) -> str:
-#         """Converts a snake_case table_name to a CamelCase class name."""
-#         return "".join(word.capitalize() for word in table_name.split("_"))
-#
-#     @staticmethod
-#     def _normalize_column_name(column_name: str) -> str:
-#         """
-#         Sanitizes a column name to be a valid Python identifier.
-#         (Kept for utility, though not used in the final model creation).
-#         """
-#         sane_name = re.sub(r"\W", "_", column_name)
-#         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
-#
-#         if keyword.iskeyword(sane_name):
-#             return f"{sane_name}_field"
-#         return sane_name
-#
-#
sibi_dst/utils/async_utils.py (new file)

@@ -0,0 +1,12 @@
+import asyncio
+import dask.dataframe as dd
+
+
+def is_dask_dataframe(df):
+    """Check if the given object is a Dask DataFrame."""
+    return isinstance(df, dd.DataFrame)
+
+async def to_thread(func, *args, **kwargs):
+    """Explicit helper to keep code clear where we hop off the event loop."""
+    return await asyncio.to_thread(func, *args, **kwargs)
+
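The new `async_utils.py` is shown in full above; a usage sketch, assuming the helpers are importable as `sibi_dst.utils.async_utils` (path inferred from the file location):

```python
import asyncio

import dask.dataframe as dd
import pandas as pd

# Import path inferred from sibi_dst/utils/async_utils.py.
from sibi_dst.utils.async_utils import is_dask_dataframe, to_thread


def blocking_load() -> dd.DataFrame:
    # Stand-in for a blocking loader (e.g. a Parquet or SQL read).
    pdf = pd.DataFrame({"id": [1, 2, 3]})
    return dd.from_pandas(pdf, npartitions=1)


async def main() -> None:
    # Hop off the event loop for the blocking call, as the helper intends.
    df = await to_thread(blocking_load)
    if is_dask_dataframe(df):
        print("rows:", len(df.compute()))


asyncio.run(main())
```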
sibi_dst/utils/boilerplate/base_data_artifact.py (new file)

@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+import asyncio
+from typing import Any, Dict, Mapping, Optional, Type, Union
+from datetime import date, datetime
+
+import pandas as pd
+import dask.dataframe as dd
+from sibi_dst.df_helper import ParquetArtifact
+
+
+DateLike = Union[str, date, datetime, None]
+
+
+def _validate_and_format_date(name: str, value: DateLike) -> Optional[str]:
+    """
+    Normalize date-like input into a canonical string '%Y-%m-%d'.
+
+    - None -> None
+    - str/date/datetime -> parse with pandas.to_datetime, take .date(), return '%Y-%m-%d'
+    - else -> TypeError
+    """
+    if value is None:
+        return None
+    if isinstance(value, (str, date, datetime)):
+        try:
+            return pd.to_datetime(value).date().strftime("%Y-%m-%d")
+        except Exception as e:
+            raise ValueError(f"{name} must be a valid date, got {value!r}") from e
+    raise TypeError(f"{name} must be str, date, datetime, or None; got {type(value)}")
+
+
+class BaseDataArtifact(ParquetArtifact):
+    """
+    Base class for Parquet artifacts with optional date window.
+
+    Dates are always stored as strings in '%Y-%m-%d' format.
+    """
+
+    config: Mapping[str, Any] = {}
+
+    parquet_start_date: Optional[str]
+    parquet_end_date: Optional[str]
+    data_wrapper_class: Optional[Type[Any]]
+    class_params: Dict[str, Any]
+    df: Union[pd.DataFrame | dd.DataFrame] = None
+
+    def __init__(
+        self,
+        **kwargs: Any,
+    ) -> None:
+        merged = {**self.config, **kwargs}
+        super().__init__(**merged)
+
+        # Normalize and store as canonical strings
+        self.parquet_start_date = _validate_and_format_date("parquet_start_date", merged.get("parquet_start_date", None))
+        self.parquet_end_date = _validate_and_format_date("parquet_end_date", merged.get("parquet_end_date", None))
+
+        self.data_wrapper_class = merged.get("data_wrapper_class", None)
+        self.class_params = merged.get("class_params", None) or {
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": getattr(self, "verbose", False),
+        }
+
+        # Ordering check
+        if self.parquet_start_date and self.parquet_end_date:
+            if self.parquet_start_date > self.parquet_end_date:
+                raise ValueError(
+                    f"parquet_start_date {self.parquet_start_date} "
+                    f"cannot be after parquet_end_date {self.parquet_end_date}"
+                )
+
+    # -------- Optional hooks --------
+
+    def before_load(self, **kwargs: Any) -> None: return None
+    def after_load(self, **kwargs: Any) -> None: return None
+    async def abefore_load(self, **kwargs: Any) -> None: return None
+    async def aafter_load(self, **kwargs: Any) -> None: return None
+
+    # -------- Public API --------
+
+    def load(self, **kwargs: Any):
+        self.before_load(**kwargs)
+        self.df = super().load(**kwargs)
+        self.after_load(**kwargs)
+        return self.df
+
+    async def aload(self, **kwargs: Any):
+        await self.abefore_load(**kwargs)
+        df = await asyncio.to_thread(super().load, **kwargs)
+        self.df = df
+        await self.aafter_load(**kwargs)
+        return self.df
+
+    def has_date_window(self) -> bool:
+        return bool(self.parquet_start_date or self.parquet_end_date)
+
+    def date_window(self) -> tuple[Optional[str], Optional[str]]:
+        return self.parquet_start_date, self.parquet_end_date
+
+    def to_params(self) -> Dict[str, Any]:
+        return {
+            "parquet_start_date": self.parquet_start_date,
+            "parquet_end_date": self.parquet_end_date,
+            "data_wrapper_class": self.data_wrapper_class,
+            "class_params": dict(self.class_params),
+        }
+
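A subclassing sketch for the new `BaseDataArtifact`: class-level `config` is merged with constructor kwargs, dates are normalized to `'%Y-%m-%d'` strings (and rejected when the start falls after the end), and `load()`/`aload()` wrap the parent's `load()` with the optional hooks. The subclass name, config values, and commented wrapper class below are placeholders; constructor kwargs ultimately go to `ParquetArtifact`, which is not part of this diff, so instantiation is only outlined:

```python
import asyncio

# Import path inferred from the new boilerplate package (sibi_dst/utils/boilerplate/__init__.py).
from sibi_dst.utils.boilerplate import BaseDataArtifact


class OrdersArtifact(BaseDataArtifact):
    # Class-level defaults; constructor kwargs override them via {**config, **kwargs}.
    config = {
        "parquet_start_date": "2025-01-01",
        "parquet_end_date": "2025-01-31",
        # "data_wrapper_class": OrdersDataWrapper,  # application-supplied, hypothetical
    }

    def before_load(self, **kwargs):
        self.logger.debug(f"loading window {self.date_window()}")

    def after_load(self, **kwargs):
        self.logger.debug("load finished")


# Remaining kwargs (fs, logger, storage settings, ...) are forwarded to ParquetArtifact:
# artifact = OrdersArtifact(fs=..., logger=..., debug=True)
# df = artifact.load()                   # sync: before_load -> load -> after_load
# df = asyncio.run(artifact.aload())     # async: hooks + load in a worker thread
```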
sibi_dst/utils/boilerplate/base_data_cube.py (new file)

@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from typing import Union
+import dask.dataframe as dd
+import pandas as pd
+
+from sibi_dst.df_helper import DfHelper
+
+
+class BaseDataCube(DfHelper):
+    """
+    Base cube with sync/async load hooks.
+
+    Subclasses *may* override:
+      - fix_data(self, **kwargs): synchronous, local transforms
+      - async afix_data(self, **kwargs): asynchronous transforms (I/O, awaits)
+
+    Semantics:
+      - load()  -> runs fix_data() if defined
+      - aload() -> runs afix_data() if subclass overrides it, else fix_data()
+    """
+    df: Union[dd.DataFrame, pd.DataFrame, None] = None
+    config: dict = {}
+
+    def __init__(self, **kwargs):
+        # kwargs override class config
+        kwargs = {**self.config, **kwargs}
+        super().__init__(**kwargs)
+
+    # -------------------- optional hooks --------------------
+
+    def fix_data(self, **kwargs) -> None:
+        """Optional sync transform hook. Override in subclasses if needed."""
+        return None
+
+    async def afix_data(self, **kwargs) -> None:
+        """Optional async transform hook. Override in subclasses if needed."""
+        return None
+
+    # -------------------- internals --------------------
+
+    def _has_data(self) -> bool:
+        """Check if dataframe has rows; avoids hidden heavy ops where possible."""
+        if self.df is None:
+            return False
+        if isinstance(self.df, dd.DataFrame):
+            return bool(self.df.shape[0].compute() > 0)
+        return not self.df.empty
+
+    def _afix_data_is_overridden(self) -> bool:
+        """Check if subclass provided its own afix_data."""
+        return self.__class__.afix_data is not BaseDataCube.afix_data
+
+    def _fix_data_is_overridden(self) -> bool:
+        """Check if subclass provided its own fix_data."""
+        return self.__class__.fix_data is not BaseDataCube.fix_data
+
+    # -------------------- public API --------------------
+
+    def load(self, **kwargs):
+        """Sync load path with optional fix_data hook."""
+        self.df = super().load(**kwargs)
+        if self._has_data() and self._fix_data_is_overridden():
+            self.fix_data()
+        elif not self._has_data():
+            self.logger.debug(f"No data was found by {self.__class__.__name__} loader")
+        return self.df
+
+    async def aload(self, **kwargs):
+        """Async load path with optional afix_data/fix_data hook."""
+        self.df = await super().aload(**kwargs)
+        if self._has_data():
+            if self._afix_data_is_overridden():
+                await self.afix_data()
+            elif self._fix_data_is_overridden():
+                self.fix_data()
+        else:
+            self.logger.debug(f"No data was found by {self.__class__.__name__} loader")
+        return self.df
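A matching sketch for `BaseDataCube`: `load()` runs `fix_data()` only when the subclass overrides it and rows came back, while `aload()` prefers an overridden `afix_data()` and falls back to `fix_data()`. The subclass and its config are placeholders; the kwargs accepted by `DfHelper` are outside this diff:

```python
import asyncio

# Import path inferred from the new boilerplate package; OrdersCube is hypothetical.
from sibi_dst.utils.boilerplate import BaseDataCube


class OrdersCube(BaseDataCube):
    # Placeholder config; real keys depend on DfHelper, which is not shown in this diff.
    config = {"debug": True}

    def fix_data(self, **kwargs) -> None:
        # Synchronous clean-up; load() calls this only when data was found.
        self.df = self.df.rename(columns=str.lower)

    async def afix_data(self, **kwargs) -> None:
        # aload() prefers this override; fix_data() is used only when afix_data()
        # is left at the base-class default.
        self.df = self.df.rename(columns=str.lower)


# cube = OrdersCube()
# df = cube.load()                   # sync path
# df = asyncio.run(cube.aload())     # async path
```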