sibi-dst 2025.1.4-py3-none-any.whl → 2025.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +41 -53
- sibi_dst/df_helper/_parquet_reader.py +11 -16
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +117 -0
- sibi_dst/utils/clickhouse_writer.py +7 -5
- sibi_dst/utils/data_wrapper.py +64 -89
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +94 -373
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +75 -25
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/METADATA +4 -1
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/RECORD +25 -28
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/WHEEL +0 -0
@@ -1,13 +1,17 @@
+from __future__ import annotations
+
+from typing import Any
+
 import dask.dataframe as dd
 import pandas as pd

+from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
-from sibi_dst.utils import Logger
 from ._db_connection import SqlAlchemyConnectionConfig
 from ._io_dask import SQLAlchemyDask


-class SqlAlchemyLoadFromDb:
+class SqlAlchemyLoadFromDb(ManagedResource):
     """
     Orchestrates loading data from a database using SQLAlchemy into a Dask
     DataFrame by configuring and delegating to the SQLAlchemyDask loader.
@@ -18,7 +22,6 @@ class SqlAlchemyLoadFromDb:
         plugin_sqlalchemy: SqlAlchemyConnectionConfig,
         plugin_query: QueryConfig = None,
         plugin_params: ParamsConfig = None,
-        logger: Logger = None,
         **kwargs,
     ):
         """
@@ -31,16 +34,16 @@ class SqlAlchemyLoadFromDb:
             logger: An optional logger instance.
             **kwargs: Must contain 'index_column' for Dask partitioning.
         """
+        super().__init__(**kwargs)
         self.db_connection = plugin_sqlalchemy
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.debug = kwargs.get("debug", False)
         self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+        self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet

-    def build_and_load(self) -> dd.DataFrame:
+    def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
         """
         Builds and loads a Dask DataFrame from a SQLAlchemy source.

@@ -58,17 +61,20 @@ class SqlAlchemyLoadFromDb:
                 engine=self.engine,
                 chunk_size=self.chunk_size,
                 logger=self.logger,
+                verbose=self.verbose,
                 debug=self.debug
             )
-            # Create the lazy DataFrame
-
-
+            # Create the lazy DataFrame and read a record count
+            # if total_records less than 0, it means an error occurred during the loading process
+            self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+            return self.total_records, dask_df


         except Exception as e:
-            self.
+            self.total_records = -1
+            self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
             # Return an empty dataframe with the correct schema on failure
             columns = [c.name for c in self.model.__table__.columns]
-            return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+            return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)


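Callers migrating from 2025.1.4 should note that build_and_load() now returns a (total_records, dataframe) tuple instead of a bare Dask DataFrame, and that a negative count signals a load failure. A minimal, hypothetical usage sketch (connection_config, query_config and params_config are placeholder objects, not names taken from this diff):

    loader = SqlAlchemyLoadFromDb(
        plugin_sqlalchemy=connection_config,   # a prepared SqlAlchemyConnectionConfig
        plugin_query=query_config,
        plugin_params=params_config,
        index_column="id",                     # required for Dask partitioning per the docstring
        debug=True,
    )
    total_records, df = loader.build_and_load()
    if total_records < 0:
        # An exception was caught; df is an empty frame carrying the model's column schema.
        print("Load failed; received a schema-only empty frame")
    else:
        print(f"Loaded {total_records} records into {df.npartitions} partitions")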
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py
CHANGED
@@ -54,8 +54,6 @@ class SqlAlchemyModelBuilder:
             The dynamically created ORM model class.
         """
         with self._lock:
-            # ✅ REFACTOR: Add a comment acknowledging the risk of using an
-            # internal API. This is a maintenance warning for future developers.
             # NOTE: Using a private SQLAlchemy API. This is a performance
             # optimization but may break in future versions of the library.
             registered_model = Base.registry._class_registry.get(self.class_name)
@@ -103,104 +101,4 @@ class SqlAlchemyModelBuilder:
             return f"{sane_name}_field"
         return sane_name

-
-# import keyword
-# import threading
-# from sqlalchemy import MetaData, Engine
-# from sqlalchemy.orm import DeclarativeBase
-#
-#
-#
-# class Base(DeclarativeBase):
-#     """shared declarative base for all ORM models."""
-#     pass
-#
-#
-# apps_label = "datacubes.models"
-#
-#
-# class SqlAlchemyModelBuilder:
-#     """
-#     Builds a single SQLAlchemy ORM model from a specific database table.
-#     This class is thread-safe and caches reflected table metadata to
-#     improve performance across multiple instantiations.
-#     """
-#     _lock = threading.Lock()
-#     _metadata_cache: dict[str, MetaData] = {}
-#
-#     def __init__(self, engine: Engine, table_name: str):
-#         """
-#         Initializes the model builder for a specific table.
-#
-#         Args:
-#             engine: The SQLAlchemy engine connected to the database.
-#             table_name: The name of the table to generate the model for.
-#         """
-#         self.engine = engine
-#         self.table_name = table_name
-#         self.class_name = self._normalize_class_name(self.table_name)
-#
-#         # Use or create a cached MetaData object for this engine to avoid
-#         # re-reading the schema for tables that are already known.
-#         engine_key = str(engine.url)
-#         if engine_key not in self._metadata_cache:
-#             self._metadata_cache[engine_key] = MetaData()
-#         self.metadata = self._metadata_cache[engine_key]
-#
-#     def build_model(self) -> type:
-#         """
-#         Builds and returns a database model class for the specified table.
-#         This process is atomic and thread-safe.
-#
-#         Raises:
-#             ValueError: If the specified table does not exist in the database.
-#         Returns:
-#             The dynamically created ORM model class.
-#         """
-#         with self._lock:
-#             # First, check if the model class is already registered in SQLAlchemy
-#             registered_model = Base.registry._class_registry.get(self.class_name)
-#             if registered_model:
-#                 return registered_model
-#
-#             # Next, check if the table's schema is in our metadata cache
-#             table = self.metadata.tables.get(self.table_name)
-#
-#             # If not cached, reflect it from the database
-#             if table is None:
-#                 self.metadata.reflect(bind=self.engine, only=[self.table_name])
-#                 table = self.metadata.tables.get(self.table_name)
-#
-#             if table is None:
-#                 raise ValueError(
-#                     f"Table '{self.table_name}' does not exist in the database."
-#                 )
-#
-#             # Create the model class dynamically.
-#             # No need to add columns manually; __table__ handles it.
-#             attrs = {
-#                 "__tablename__": table.name,
-#                 "__table__": table,
-#                 "__module__": apps_label,
-#             }
-#             model = type(self.class_name, (Base,), attrs)
-#
-#             return model
-#
-#     @staticmethod
-#     def _normalize_class_name(table_name: str) -> str:
-#         """Converts a snake_case table_name to a CamelCase class name."""
-#         return "".join(word.capitalize() for word in table_name.split("_"))
-#
-#     @staticmethod
-#     def _normalize_column_name(column_name: str) -> str:
-#         """
-#         Sanitizes a column name to be a valid Python identifier.
-#         (Kept for utility, though not used in the final model creation).
-#         """
-#         sane_name = re.sub(r"\W", "_", column_name)
-#         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
-#
-#         if keyword.iskeyword(sane_name):
-#             return f"{sane_name}_field"
-#         return sane_name
+
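The large block removed above was a commented-out reference copy of SqlAlchemyModelBuilder; its docstrings still describe how the live builder is meant to be used. A hypothetical sketch based on that removed reference (the engine URL and table name are placeholders, and the live class may differ slightly from the commented-out copy):

    from sqlalchemy import create_engine

    engine = create_engine("sqlite:///example.db")      # placeholder DSN
    builder = SqlAlchemyModelBuilder(engine, "orders")   # placeholder table name
    OrdersModel = builder.build_model()                  # dynamically created ORM class, cached per class name
    print(OrdersModel.__tablename__)                     # -> "orders"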
sibi_dst/utils/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from .log_utils import Logger
+from .base import ManagedResource
 from .date_utils import *
 from .data_utils import DataUtils
 from .file_utils import FileUtils
@@ -20,6 +21,7 @@ from .manifest_manager import MissingManifestManager

 __all__ = [
     "Logger",
+    "ManagedResource",
     "ConfigManager",
     "ConfigLoader",
     "DateUtils",
@@ -38,6 +40,5 @@ __all__ = [
     "FsRegistry",
     "DataFromHttpSource",
     "WebDAVClient",
-    "MissingManifestManager"
+    "MissingManifestManager"
 ]
-
sibi_dst/utils/base.py
ADDED
@@ -0,0 +1,117 @@
+import asyncio
+from .log_utils import Logger
+
+class ManagedResource:
+    """
+    A base class providing context management for resources like loggers and filesystems.
+
+    It handles the creation and cleanup of these resources, ensuring they are only
+    closed if they were created by the instance itself.
+    """
+
+    def __init__(self, **kwargs):
+        self.debug = kwargs.get("debug", False)
+        self.verbose = kwargs.get("verbose", False)
+
+        # --- Logger Management (Refactored) ---
+        logger = kwargs.get("logger")
+        if logger:
+            # An existing logger instance was provided by the user
+            self.logger = logger
+            self._own_logger = False
+            self.logger.debug(f"'{self.__class__.__name__}' is tapping into an existing logger.")
+        else:
+            # No pre-configured logger, so we will create and "own" a new one.
+            self._own_logger = True
+            logger_config = kwargs.get("logger_config", {})
+
+            # Set default logger_name if not specified in the config
+            logger_config.setdefault("logger_name", self.__class__.__name__)
+
+            # Set log_level based on debug flag, but respect user-provided level
+            default_level = Logger.DEBUG if self.debug else Logger.INFO
+            logger_config.setdefault("log_level", default_level)
+
+            # Create the logger using the provided or default configuration
+            self.logger = Logger.default_logger(**logger_config)
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is starting its own logger.")
+
+        fs = kwargs.get("fs")
+        self._own_fs = fs is None
+        self.fs = fs or None  # we want to allow None as a valid fs to trigger a failure if needed
+
+        self._entered = False
+
+    def __enter__(self):
+        """Enter the runtime context."""
+        self._entered = True
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the runtime context and trigger cleanup."""
+        self.cleanup()
+        return False  # Propagate exceptions
+
+    # --- Asynchronous Context Management ---
+
+    async def __aenter__(self):
+        """Enter the runtime context for 'async with' statements."""
+        self._entered = True
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the runtime context and trigger cleanup for 'async with' statements."""
+        await self.acleanup()
+        return False  # Propagate exceptions
+
+    def __repr__(self) -> str:
+        """Return an unambiguous string representation of the ManagedResource."""
+        # Dynamically get the name of the class or subclass
+        class_name = self.__class__.__name__
+
+        # Determine the status of the logger and filesystem
+        logger_status = "own" if self._own_logger else "external"
+        fs_status = "own" if self._own_fs else "external"
+
+        return (
+            f"<{class_name} debug={self.debug}, "
+            f"logger='{logger_status}', fs='{fs_status}'>"
+        )
+
+    def cleanup(self):
+        """
+        Cleanup resources managed by this instance.
+        """
+        if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+            self.fs.clear_instance_cache()
+
+        if self._own_logger and hasattr(self.logger, "shutdown"):
+            # Ensure the logger exists before trying to use or shut it down
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+            self.logger.shutdown()
+            self.logger = None  # Set to None after shutdown
+
+        self._entered = False
+
+    async def acleanup(self):
+        """
+        Async Cleanup resources managed by this instance.
+        """
+        if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+            self.fs.clear_instance_cache()
+
+        if self._own_logger and hasattr(self.logger, "shutdown"):
+            # Ensure the logger exists before trying to use or shut it down
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+            self.logger.shutdown()
+            self.logger = None  # Set to None after shutdown
+
+        self._entered = False
+
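The new ManagedResource base class centralizes logger and filesystem ownership for the classes refactored in this release (SqlAlchemyLoadFromDb, ClickHouseWriter, DataWrapper, among others). A minimal, hypothetical subclass showing the intended pattern: an externally supplied fs is left alone on cleanup, while an internally created logger is shut down (when it exposes a shutdown() method) as the context exits. LocalArtifact is illustrative and not part of the package.

    import fsspec
    from sibi_dst.utils import ManagedResource

    class LocalArtifact(ManagedResource):
        """Illustrative subclass; inherits logger/fs wiring from ManagedResource."""
        def touch(self, path: str):
            self.logger.info(f"Touching {path}")   # logger created or reused by the base class
            self.fs.touch(path)                    # fs supplied by the caller

    fs = fsspec.filesystem("file")
    with LocalArtifact(fs=fs, debug=True) as artifact:   # external fs -> not cleaned up
        artifact.touch("/tmp/example.txt")
    print(repr(artifact))   # e.g. <LocalArtifact debug=True, logger='own', fs='external'>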
sibi_dst/utils/clickhouse_writer.py
CHANGED
@@ -1,14 +1,15 @@
 from concurrent.futures import ThreadPoolExecutor
+from typing import ClassVar, Dict

 import clickhouse_connect
 import pandas as pd
 from clickhouse_driver import Client
 import dask.dataframe as dd

-from .
+from . import ManagedResource


-class ClickHouseWriter:
+class ClickHouseWriter(ManagedResource):
     """
     Provides functionality to write a Dask DataFrame to a ClickHouse database using
     a specified schema. This class handles the creation of tables, schema generation,
@@ -36,7 +37,7 @@ class ClickHouseWriter:
     :ivar order_by: Field or column name to use for table ordering.
     :type order_by: str
     """
-    dtype_to_clickhouse = {
+    dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
         'int64': 'Int64',
         'int32': 'Int32',
         'float64': 'Float64',
@@ -48,7 +49,8 @@ class ClickHouseWriter:
     }
     df: dd.DataFrame

-    def __init__(self,
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
         self.clickhouse_host = kwargs.setdefault('host', "localhost")
         self.clickhouse_port = kwargs.setdefault('port', 8123)
         self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
@@ -56,7 +58,7 @@ class ClickHouseWriter:
         self.clickhouse_password = kwargs.setdefault('password', '')
         self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')

-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.client = None
         self.order_by = kwargs.setdefault('order_by', 'id')

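Since ClickHouseWriter now takes its entire configuration through **kwargs and inherits context management from ManagedResource, construction can look like the hypothetical sketch below. The connection values are placeholders based on the defaults shown above, and the actual write/save method is not shown in this hunk.

    with ClickHouseWriter(
        host="clickhouse.local",   # defaults to "localhost"
        port=8123,
        database="sibi_data",
        table="events",
        order_by="id",
        debug=True,                # consumed by ManagedResource for the log level
    ) as writer:
        ...                        # invoke the writer's save/write entry point here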
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -3,26 +3,27 @@ import logging
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Type, Any, Dict, Optional, Union, List
+from typing import Type, Any, Dict, Optional, Union, List, ClassVar

 import fsspec
 import pandas as pd
 from tqdm import tqdm

+from . import ManagedResource
 from .log_utils import Logger
 from .parquet_saver import ParquetSaver


-class DataWrapper:
-    DEFAULT_PRIORITY_MAP = {
+class DataWrapper(ManagedResource):
+    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "overwrite": 1,
         "missing_in_history": 2,
         "existing_but_stale": 3,
         "missing_outside_history": 4,
         "file_is_recent": 0
     }
-    DEFAULT_MAX_AGE_MINUTES = 1440
-    DEFAULT_HISTORY_DAYS_THRESHOLD = 30
+    DEFAULT_MAX_AGE_MINUTES: int = 1440
+    DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30

     def __init__(
         self,
@@ -30,26 +31,20 @@ class DataWrapper:
         date_field: str,
         data_path: str,
         parquet_filename: str,
-        fs: Optional[fsspec.AbstractFileSystem] = None,
-        debug: bool = False,
-        verbose: bool = False,
         class_params: Optional[Dict] = None,
         load_params: Optional[Dict] = None,
-        logger: Logger = None,
         show_progress: bool = False,
         timeout: float = 30,
         max_threads: int = 3,
         **kwargs: Any,
     ):
+        super().__init__(**kwargs)
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
-        self.fs
-
-        self.verbose = verbose
-        self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
-        self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+        if self.fs is None:
+            raise ValueError("Datawrapper requires a File system (fs) to be provided .")
         self.show_progress = show_progress
         self.timeout = timeout
         self.max_threads = max_threads
@@ -66,25 +61,15 @@ class DataWrapper:
         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner=kwargs.get("update_planner", None)
-        self.datacls = self.dataclass(**self.class_params)

-    def __enter__(self):
-        """Context manager entry"""
-        return self

     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit"""
-        if self.mmanifest
+        if self.mmanifest:
             self.mmanifest.save()
-
-        if exc_type is not None:
-            self.logger.error(f"Exception occurred: {exc_val}")
+        super().__exit__(exc_type, exc_val, exc_tb)
         return False

-    def _init_filesystem(self) -> fsspec.AbstractFileSystem:
-        with self._lock:
-            return fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-
     @staticmethod
     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
@@ -101,78 +86,68 @@ class DataWrapper:
     def process(self, max_retries: int = 3):
         """Process updates with priority-based execution, retries, benchmarking and progress updates"""
         overall_start = time.perf_counter()
-
-
-
-        if plan_count == 0:
-            self.logger.info("No updates required")
+        tasks = list(self.update_planner.get_tasks_by_priority())
+        if not tasks:
+            self.logger.info("No updates required based on the current plan.")
             return
-        self.logger.info(f"Update plan for {self.dataclass.__name__} includes {plan_count} items for update")

-        if self.
+        if self.update_planner.show_progress:
             self.update_planner.show_update_plan()

-        for priority in
-            self.
+        for priority, dates in tasks:
+            self._execute_task_batch(priority, dates, max_retries)

         total_time = time.perf_counter() - overall_start
-
-
-        self.logger.info(
-
-            f"(avg {total_time / processed:.1f}s per date)"
-        )
-        if self.show_progress or self.verbose:
+        if self.processed_dates:
+            count = len(self.processed_dates)
+            self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+        if self.update_planner.show_progress:
             self.show_benchmark_summary()

-
-
-
-        priority: int,
-        max_retries: int
-    ):
-        """Process a single priority group with parallel execution and timing"""
-        dates = plan[plan["update_priority"] == priority]["date"].tolist()
-        if not dates:
-            return
+
+    def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
+        """Executes a single batch of tasks (dates) using a thread pool."""
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
-        self.logger.debug(f"Starting {desc.lower()}")
-        group_start = time.perf_counter()
         max_thr = min(len(dates), self.max_threads)
-        self.logger.
+        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+
         with ThreadPoolExecutor(max_workers=max_thr) as executor:
             futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
-
-
+            iterator = as_completed(futures)
+            if self.show_progress:
+                iterator = tqdm(iterator, total=len(futures), desc=desc)
+
+            for future in iterator:
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure
-        group_time = time.perf_counter() - group_start
-        self.logger.info(f"Priority {priority} group processed {len(dates)} dates in {group_time:.1f}s")
+                    self.logger.error(f"Permanent failure for {futures[future]}: {e}")

     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-
+        """Wrapper to apply retry logic to single date processing."""
+        for attempt in range(max_retries):
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
-                if attempt < max_retries:
-                    self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {e}")
+                if attempt < max_retries - 1:
+                    self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
+                    time.sleep(2 ** attempt)  # Exponential backoff
                 else:
-
+                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+                    #raise

     def _process_single_date(self, date: datetime.date):
         """Core date processing logic with load/save timing and thread reporting"""
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
-            self.logger.
+            self.logger.debug(f"Skipping {date} as it exists in the skipped list")
             return
         full_path = f"{path}{self.parquet_filename}"

-        thread_name = threading.current_thread().name
-        self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+        #thread_name = threading.current_thread().name
+        #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")

         overall_start = time.perf_counter()
         try:
@@ -180,30 +155,30 @@ class DataWrapper:
             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
             # Load data using the dataclass with the provided date filter
-
-
+            # Create a copy to avoid mutating the shared instance dictionary
+            local_load_params = self.load_params.copy()
+            local_load_params.update(date_filter)
+            local_class_instance = self.dataclass(**self.class_params)
+            df=local_class_instance.load(**local_load_params)
             load_time = time.perf_counter() - load_start
-            if df.head(1, compute=True).empty:
-                if self.mmanifest:
-                    schema = df._meta.dtypes.astype(str).to_dict()
-                    self.mmanifest.record(
-                        full_path=path
-                    )
-                self.logger.info(f"No data found for {date}. Logged to missing manifest.")
-                return
-            # Dask-compatible empty check
-            # if len(df.index) == 0:
-            #     self.logger.warning(f"No data found for {date}")
-            #     return

+            if hasattr(local_class_instance, "total_records"):
+                self.logger.debug(f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+                if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+                    if self.mmanifest:
+                        self.mmanifest.record(
+                            full_path=path
+                        )
+                    self.logger.info(f"No data found for {date}. Logged to missing manifest.")
+                    return
             save_start = time.perf_counter()
-            with
-
-
-
-
-
+            with ParquetSaver(
+                df_result=df,
+                parquet_storage_path=path,
+                fs=self.fs,
+                logger=self.logger
+            ) as ps:
+                ps.save_to_parquet(self.parquet_filename, overwrite=True)
             save_time = time.perf_counter() - save_start

             total_time = time.perf_counter() - overall_start
@@ -233,4 +208,4 @@ class DataWrapper:
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info("Benchmark Summary:\n" + df_bench.to_string())
+        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
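DataWrapper now requires an fsspec filesystem, takes debug, verbose, logger and fs through the ManagedResource kwargs, and reads mmanifest and update_planner directly from **kwargs. A hypothetical usage sketch (MyDatasetHelper and planner are placeholders, not names from this diff):

    import fsspec

    fs = fsspec.filesystem("file")
    with DataWrapper(
        dataclass=MyDatasetHelper,          # helper class exposing .load() and .total_records
        date_field="created_at",
        data_path="/data/my_dataset/",
        parquet_filename="my_dataset.parquet",
        fs=fs,                              # omitting this now raises ValueError
        class_params={},
        load_params={},
        show_progress=True,
        update_planner=planner,             # must provide get_tasks_by_priority()
        debug=True,
    ) as wrapper:
        wrapper.process(max_retries=3)      # per-date retries with exponential backoff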
sibi_dst/utils/date_utils.py
CHANGED
@@ -29,8 +29,9 @@ class DateUtils:
     """
     _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}

-    def __init__(self, logger=None):
+    def __init__(self, logger=None, debug=False):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = debug

     @classmethod
     def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date: