sibi-dst 2025.1.4-py3-none-any.whl → 2025.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +57 -47
- sibi_dst/df_helper/_parquet_reader.py +9 -13
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +97 -0
- sibi_dst/utils/clickhouse_writer.py +5 -4
- sibi_dst/utils/data_wrapper.py +69 -84
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +94 -373
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +72 -22
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/METADATA +2 -1
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/RECORD +24 -27
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
CHANGED
@@ -1,13 +1,17 @@
+from __future__ import annotations
+
+from typing import Any
+
 import dask.dataframe as dd
 import pandas as pd
 
+from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
-from sibi_dst.utils import Logger
 from ._db_connection import SqlAlchemyConnectionConfig
 from ._io_dask import SQLAlchemyDask
 
 
-class SqlAlchemyLoadFromDb:
+class SqlAlchemyLoadFromDb(ManagedResource):
     """
     Orchestrates loading data from a database using SQLAlchemy into a Dask
     DataFrame by configuring and delegating to the SQLAlchemyDask loader.
@@ -18,7 +22,6 @@ class SqlAlchemyLoadFromDb:
             plugin_sqlalchemy: SqlAlchemyConnectionConfig,
             plugin_query: QueryConfig = None,
             plugin_params: ParamsConfig = None,
-            logger: Logger = None,
             **kwargs,
     ):
         """
@@ -31,16 +34,16 @@ class SqlAlchemyLoadFromDb:
             logger: An optional logger instance.
             **kwargs: Must contain 'index_column' for Dask partitioning.
         """
+        super().__init__(**kwargs)
         self.db_connection = plugin_sqlalchemy
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.debug = kwargs.get("debug", False)
         self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+        self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
 
-    def build_and_load(self) -> dd.DataFrame:
+    def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
         """
         Builds and loads a Dask DataFrame from a SQLAlchemy source.
 
@@ -58,17 +61,20 @@ class SqlAlchemyLoadFromDb:
                 engine=self.engine,
                 chunk_size=self.chunk_size,
                 logger=self.logger,
+                verbose=self.verbose,
                 debug=self.debug
             )
-            # Create the lazy DataFrame
-
-
+            # Create the lazy DataFrame and read a record count
+            # if total_records less than 0, it means an error occurred during the loading process
+            self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+            return self.total_records, dask_df
 
 
         except Exception as e:
-            self.
+            self.total_records = -1
+            self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
             # Return an empty dataframe with the correct schema on failure
             columns = [c.name for c in self.model.__table__.columns]
-            return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+            return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
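The hunk above changes the contract of `build_and_load`: it now returns a `(total_records, dask_df)` tuple instead of a bare Dask DataFrame, with `total_records == -1` signalling a failed load and an empty frame that still carries the model's columns. A minimal caller sketch under that contract; the config objects and the `index_column` value are placeholders, not taken from the package docs:

```python
# Hypothetical caller; connection_config / query_config / params_config are assumed
# to be pre-built SqlAlchemyConnectionConfig / QueryConfig / ParamsConfig instances.
from sibi_dst.df_helper.backends.sqlalchemy._load_from_db import SqlAlchemyLoadFromDb

loader = SqlAlchemyLoadFromDb(
    plugin_sqlalchemy=connection_config,
    plugin_query=query_config,
    plugin_params=params_config,
    index_column="id",        # the docstring says kwargs must carry 'index_column'
    debug=True,               # consumed by ManagedResource, not by this class directly
)

total_records, ddf = loader.build_and_load()
if total_records < 0:
    print("load failed; ddf is an empty frame with the model's schema")
elif total_records == 0:
    print("query matched no rows")
else:
    print(ddf.head())         # lazily built Dask DataFrame, computed on demand
```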
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py
CHANGED
@@ -54,8 +54,6 @@ class SqlAlchemyModelBuilder:
             The dynamically created ORM model class.
         """
         with self._lock:
-            # ✅ REFACTOR: Add a comment acknowledging the risk of using an
-            # internal API. This is a maintenance warning for future developers.
             # NOTE: Using a private SQLAlchemy API. This is a performance
             # optimization but may break in future versions of the library.
             registered_model = Base.registry._class_registry.get(self.class_name)
@@ -103,104 +101,4 @@ class SqlAlchemyModelBuilder:
             return f"{sane_name}_field"
         return sane_name
 
-
-# import keyword
-# import threading
-# from sqlalchemy import MetaData, Engine
-# from sqlalchemy.orm import DeclarativeBase
-#
-#
-#
-# class Base(DeclarativeBase):
-#     """shared declarative base for all ORM models."""
-#     pass
-#
-#
-# apps_label = "datacubes.models"
-#
-#
-# class SqlAlchemyModelBuilder:
-#     """
-#     Builds a single SQLAlchemy ORM model from a specific database table.
-#     This class is thread-safe and caches reflected table metadata to
-#     improve performance across multiple instantiations.
-#     """
-#     _lock = threading.Lock()
-#     _metadata_cache: dict[str, MetaData] = {}
-#
-#     def __init__(self, engine: Engine, table_name: str):
-#         """
-#         Initializes the model builder for a specific table.
-#
-#         Args:
-#             engine: The SQLAlchemy engine connected to the database.
-#             table_name: The name of the table to generate the model for.
-#         """
-#         self.engine = engine
-#         self.table_name = table_name
-#         self.class_name = self._normalize_class_name(self.table_name)
-#
-#         # Use or create a cached MetaData object for this engine to avoid
-#         # re-reading the schema for tables that are already known.
-#         engine_key = str(engine.url)
-#         if engine_key not in self._metadata_cache:
-#             self._metadata_cache[engine_key] = MetaData()
-#         self.metadata = self._metadata_cache[engine_key]
-#
-#     def build_model(self) -> type:
-#         """
-#         Builds and returns a database model class for the specified table.
-#         This process is atomic and thread-safe.
-#
-#         Raises:
-#             ValueError: If the specified table does not exist in the database.
-#         Returns:
-#             The dynamically created ORM model class.
-#         """
-#         with self._lock:
-#             # First, check if the model class is already registered in SQLAlchemy
-#             registered_model = Base.registry._class_registry.get(self.class_name)
-#             if registered_model:
-#                 return registered_model
-#
-#             # Next, check if the table's schema is in our metadata cache
-#             table = self.metadata.tables.get(self.table_name)
-#
-#             # If not cached, reflect it from the database
-#             if table is None:
-#                 self.metadata.reflect(bind=self.engine, only=[self.table_name])
-#                 table = self.metadata.tables.get(self.table_name)
-#
-#             if table is None:
-#                 raise ValueError(
-#                     f"Table '{self.table_name}' does not exist in the database."
-#                 )
-#
-#             # Create the model class dynamically.
-#             # No need to add columns manually; __table__ handles it.
-#             attrs = {
-#                 "__tablename__": table.name,
-#                 "__table__": table,
-#                 "__module__": apps_label,
-#             }
-#             model = type(self.class_name, (Base,), attrs)
-#
-#             return model
-#
-#     @staticmethod
-#     def _normalize_class_name(table_name: str) -> str:
-#         """Converts a snake_case table_name to a CamelCase class name."""
-#         return "".join(word.capitalize() for word in table_name.split("_"))
-#
-#     @staticmethod
-#     def _normalize_column_name(column_name: str) -> str:
-#         """
-#         Sanitizes a column name to be a valid Python identifier.
-#         (Kept for utility, though not used in the final model creation).
-#         """
-#         sane_name = re.sub(r"\W", "_", column_name)
-#         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
-#
-#         if keyword.iskeyword(sane_name):
-#             return f"{sane_name}_field"
-#         return sane_name
+
sibi_dst/utils/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from .log_utils import Logger
+from .base import ManagedResource
 from .date_utils import *
 from .data_utils import DataUtils
 from .file_utils import FileUtils
@@ -20,6 +21,7 @@ from .manifest_manager import MissingManifestManager
 
 __all__ = [
     "Logger",
+    "ManagedResource",
    "ConfigManager",
    "ConfigLoader",
    "DateUtils",
@@ -38,6 +40,5 @@ __all__ = [
    "FsRegistry",
    "DataFromHttpSource",
    "WebDAVClient",
-    "MissingManifestManager"
+    "MissingManifestManager"
 ]
-
sibi_dst/utils/base.py
ADDED
@@ -0,0 +1,97 @@
+from .log_utils import Logger
+
+class ManagedResource:
+    """
+    A base class providing context management for resources like loggers and filesystems.
+
+    It handles the creation and cleanup of these resources, ensuring they are only
+    closed if they were created by the instance itself.
+    """
+
+    def __init__(self, **kwargs):
+        self.debug = kwargs.get("debug", False)
+        self.verbose = kwargs.get("verbose", False)
+
+        # --- Logger Management (Refactored) ---
+        logger = kwargs.get("logger")
+        if logger:
+            # An existing logger instance was provided by the user
+            self.logger = logger
+            self._own_logger = False
+            self.logger.debug(f"'{self.__class__.__name__}' is tapping into an existing logger.")
+        else:
+            # No pre-configured logger, so we will create and "own" a new one.
+            self._own_logger = True
+            logger_config = kwargs.get("logger_config", {})
+
+            # Set default logger_name if not specified in the config
+            logger_config.setdefault("logger_name", self.__class__.__name__)
+
+            # Set log_level based on debug flag, but respect user-provided level
+            default_level = Logger.DEBUG if self.debug else Logger.INFO
+            logger_config.setdefault("log_level", default_level)
+
+            # Create the logger using the provided or default configuration
+            self.logger = Logger.default_logger(**logger_config)
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is starting its own logger.")
+
+        fs = kwargs.get("fs")
+        self._own_fs = fs is None
+        self.fs = fs or None  # we want to allow None as a valid fs to trigger a failure if needed
+
+        self._entered = False
+
+    def __enter__(self):
+        """Enter the runtime context."""
+        self._entered = True
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the runtime context and trigger cleanup."""
+        self.cleanup()
+        return False  # Propagate exceptions
+
+    # --- Asynchronous Context Management ---
+
+    async def __aenter__(self):
+        """Enter the runtime context for 'async with' statements."""
+        self._entered = True
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the runtime context and trigger cleanup for 'async with' statements."""
+        self.cleanup()
+        return False  # Propagate exceptions
+
+    def __repr__(self) -> str:
+        """Return an unambiguous string representation of the ManagedResource."""
+        # Dynamically get the name of the class or subclass
+        class_name = self.__class__.__name__
+
+        # Determine the status of the logger and filesystem
+        logger_status = "own" if self._own_logger else "external"
+        fs_status = "own" if self._own_fs else "external"
+
+        return (
+            f"<{class_name} debug={self.debug}, "
+            f"logger='{logger_status}', fs='{fs_status}'>"
+        )
+
+    def cleanup(self):
+        """
+        Clean up resources managed by this instance.
+        """
+        if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+            self.fs.clear_instance_cache()
+
+        if self._own_logger and hasattr(self.logger, "shutdown"):
+            # Ensure logger exists before trying to use or shut it down
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+                self.logger.shutdown()
+                self.logger = None  # Set to None after shutdown
+
+        self._entered = False
sibi_dst/utils/clickhouse_writer.py
CHANGED
@@ -5,10 +5,10 @@ import pandas as pd
 from clickhouse_driver import Client
 import dask.dataframe as dd
 
-from .
+from . import ManagedResource
 
 
-class ClickHouseWriter:
+class ClickHouseWriter(ManagedResource):
     """
     Provides functionality to write a Dask DataFrame to a ClickHouse database using
     a specified schema. This class handles the creation of tables, schema generation,
@@ -48,7 +48,8 @@ class ClickHouseWriter:
     }
     df: dd.DataFrame
 
-    def __init__(self,
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
         self.clickhouse_host = kwargs.setdefault('host', "localhost")
         self.clickhouse_port = kwargs.setdefault('port', 8123)
         self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
@@ -56,7 +57,7 @@ class ClickHouseWriter:
         self.clickhouse_password = kwargs.setdefault('password', '')
         self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
 
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.client = None
         self.order_by = kwargs.setdefault('order_by', 'id')
 
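With `__init__(self, **kwargs)` plus the `ManagedResource` base, connection settings now arrive as `setdefault`'d keyword arguments while logger/debug/fs handling moves to the base class. A hedged construction sketch using only the keyword names visible in these hunks (`host`, `port`, `database`, `password`, `table`, `order_by`); the method that actually writes a Dask DataFrame sits outside this diff, so it is not shown:

```python
from sibi_dst.utils.clickhouse_writer import ClickHouseWriter  # module path from the file list above

writer = ClickHouseWriter(
    host="localhost",
    port=8123,                  # defaults mirror the setdefault() calls in this hunk
    database="sibi_data",
    password="",
    table="test_sibi_table",
    order_by="id",
    debug=True,                 # handled by ManagedResource
)
print(writer)                   # e.g. <ClickHouseWriter debug=True, logger='own', fs='own'>
# The write entry point is not part of this diff and is intentionally omitted here.
```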
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -9,11 +9,12 @@ import fsspec
 import pandas as pd
 from tqdm import tqdm
 
+from . import ManagedResource
 from .log_utils import Logger
 from .parquet_saver import ParquetSaver
 
 
-class DataWrapper:
+class DataWrapper(ManagedResource):
     DEFAULT_PRIORITY_MAP = {
         "overwrite": 1,
         "missing_in_history": 2,
@@ -30,26 +31,30 @@ class DataWrapper:
             date_field: str,
             data_path: str,
             parquet_filename: str,
-            fs: Optional[fsspec.AbstractFileSystem] = None,
-            debug: bool = False,
-            verbose: bool = False,
+            #fs: Optional[fsspec.AbstractFileSystem] = None,
+            #debug: bool = False,
+            #verbose: bool = False,
             class_params: Optional[Dict] = None,
             load_params: Optional[Dict] = None,
-            logger: Logger = None,
+            #logger: Logger = None,
             show_progress: bool = False,
             timeout: float = 30,
             max_threads: int = 3,
             **kwargs: Any,
     ):
+        super().__init__(**kwargs)
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
-        self.fs = fs or None
-        self.
-
-        self.
-        self.
+        #self.fs = fs or None
+        if self.fs is None:
+            raise ValueError("Datawrapper requires a File system (fs) to be provided .")
+        #self.debug = debug
+        #self.verbose = verbose
+        #self._own_logger = logger is None
+        #self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
+        #self.logger.set_level(logging.DEBUG if debug else logging.INFO)
         self.show_progress = show_progress
         self.timeout = timeout
         self.max_threads = max_threads
@@ -66,25 +71,16 @@ class DataWrapper:
         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner=kwargs.get("update_planner", None)
-        self.datacls = self.dataclass(**self.class_params)
+        # self.datacls = self.dataclass(**self.class_params)
 
-    def __enter__(self):
-        """Context manager entry"""
-        return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit"""
-        if self.mmanifest
+        if self.mmanifest:
             self.mmanifest.save()
-
-        if exc_type is not None:
-            self.logger.error(f"Exception occurred: {exc_val}")
+        super().__exit__(exc_type, exc_val, exc_tb)
         return False
 
-    def _init_filesystem(self) -> fsspec.AbstractFileSystem:
-        with self._lock:
-            return fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-
     @staticmethod
     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
@@ -101,78 +97,68 @@ class DataWrapper:
     def process(self, max_retries: int = 3):
         """Process updates with priority-based execution, retries, benchmarking and progress updates"""
         overall_start = time.perf_counter()
-
-
-
-        if plan_count == 0:
-            self.logger.info("No updates required")
+        tasks = list(self.update_planner.get_tasks_by_priority())
+        if not tasks:
+            self.logger.info("No updates required based on the current plan.")
             return
-        self.logger.info(f"Update plan for {self.dataclass.__name__} includes {plan_count} items for update")
 
-        if self.
+        if self.update_planner.show_progress:
             self.update_planner.show_update_plan()
 
-        for priority in
-            self.
+        for priority, dates in tasks:
+            self._execute_task_batch(priority, dates, max_retries)
 
         total_time = time.perf_counter() - overall_start
-
-
-        self.logger.info(
-
-            f"(avg {total_time / processed:.1f}s per date)"
-        )
-        if self.show_progress or self.verbose:
+        if self.processed_dates:
+            count = len(self.processed_dates)
+            self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+        if self.update_planner.show_progress:
             self.show_benchmark_summary()
 
-
-
-
-            priority: int,
-            max_retries: int
-    ):
-        """Process a single priority group with parallel execution and timing"""
-        dates = plan[plan["update_priority"] == priority]["date"].tolist()
-        if not dates:
-            return
+
+    def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
+        """Executes a single batch of tasks (dates) using a thread pool."""
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
-        self.logger.debug(f"Starting {desc.lower()}")
-        group_start = time.perf_counter()
         max_thr = min(len(dates), self.max_threads)
-        self.logger.
+        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+
         with ThreadPoolExecutor(max_workers=max_thr) as executor:
             futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
-
-
+            iterator = as_completed(futures)
+            if self.show_progress:
+                iterator = tqdm(iterator, total=len(futures), desc=desc)
+
+            for future in iterator:
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure
-        group_time = time.perf_counter() - group_start
-        self.logger.info(f"Priority {priority} group processed {len(dates)} dates in {group_time:.1f}s")
+                    self.logger.error(f"Permanent failure for {futures[future]}: {e}")
 
     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-
+        """Wrapper to apply retry logic to single date processing."""
+        for attempt in range(max_retries):
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
-                if attempt < max_retries:
-                    self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {e}")
+                if attempt < max_retries - 1:
+                    self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
+                    time.sleep(2 ** attempt)  # Exponential backoff
                 else:
-
+                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+                    #raise
 
     def _process_single_date(self, date: datetime.date):
         """Core date processing logic with load/save timing and thread reporting"""
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
-            self.logger.
+            self.logger.debug(f"Skipping {date} as it exists in the skipped list")
             return
         full_path = f"{path}{self.parquet_filename}"
 
-        thread_name = threading.current_thread().name
-        self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+        #thread_name = threading.current_thread().name
+        #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
 
         overall_start = time.perf_counter()
         try:
@@ -180,30 +166,29 @@ class DataWrapper:
             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
             # Load data using the dataclass with the provided date filter
-
-
+            # Create a copy to avoid mutating the shared instance dictionary
+            local_load_params = self.load_params.copy()
+            local_load_params.update(date_filter)
+            local_class_instance = self.dataclass(**self.class_params)
+            df=local_class_instance.load(**local_load_params)
             load_time = time.perf_counter() - load_start
-            if df.head(1, compute=True).empty:
-                if self.mmanifest:
-                    schema = df._meta.dtypes.astype(str).to_dict()
-                    self.mmanifest.record(
-                        full_path=path
-                    )
-                self.logger.info(f"No data found for {date}. Logged to missing manifest.")
-                return
-            # Dask-compatible empty check
-            # if len(df.index) == 0:
-            #     self.logger.warning(f"No data found for {date}")
-            #     return
 
+            if hasattr(local_class_instance, "total_records"):
+                self.logger.debug(f"Total records loaded by {local_class_instance}: {local_class_instance.total_records}")
+                if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+                    if self.mmanifest:
+                        self.mmanifest.record(
+                            full_path=path
+                        )
+                    self.logger.info(f"No data found for {date}. Logged to missing manifest.")
+                    return
             save_start = time.perf_counter()
-
-
-
-
-
-
-            ).save_to_parquet(self.parquet_filename)
+            ParquetSaver(
+                df_result=df,
+                parquet_storage_path=path,
+                fs=self.fs,
+                logger=self.logger
+            ).save_to_parquet(self.parquet_filename, overwrite=True)
             save_time = time.perf_counter() - save_start
 
             total_time = time.perf_counter() - overall_start
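The reworked `_process_date_with_retry` loops over `range(max_retries)`, sleeps `2 ** attempt` seconds between attempts, and after the final attempt only logs the failure instead of re-raising (the `raise` is commented out). A standalone sketch of that backoff pattern, decoupled from `DataWrapper`; `flaky_task` is invented for illustration:

```python
import random
import time

def flaky_task() -> str:
    """Stand-in for DataWrapper._process_single_date; fails roughly half the time."""
    if random.random() < 0.5:
        raise RuntimeError("transient backend error")
    return "ok"

def run_with_retry(max_retries: int = 3) -> str | None:
    for attempt in range(max_retries):
        try:
            return flaky_task()
        except Exception as exc:
            if attempt < max_retries - 1:
                wait = 2 ** attempt            # 1s, 2s, 4s, ... exponential backoff
                print(f"retry {attempt + 1}/{max_retries} in {wait}s: {exc}")
                time.sleep(wait)
            else:
                # mirrors the new DataWrapper behaviour: log and swallow, do not re-raise
                print(f"giving up after {max_retries} attempts")
    return None

print(run_with_retry())
```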
sibi_dst/utils/date_utils.py
CHANGED
@@ -29,8 +29,9 @@ class DateUtils:
     """
     _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, debug=False):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = debug
 
     @classmethod
     def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
|