sibi-dst 2025.1.4__py3-none-any.whl → 2025.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,17 @@
+ from __future__ import annotations
+
+ from typing import Any
+
  import dask.dataframe as dd
  import pandas as pd
 
+ from sibi_dst.utils import ManagedResource
  from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
- from sibi_dst.utils import Logger
  from ._db_connection import SqlAlchemyConnectionConfig
  from ._io_dask import SQLAlchemyDask
 
 
- class SqlAlchemyLoadFromDb:
+ class SqlAlchemyLoadFromDb(ManagedResource):
      """
      Orchestrates loading data from a database using SQLAlchemy into a Dask
      DataFrame by configuring and delegating to the SQLAlchemyDask loader.
@@ -18,7 +22,6 @@ class SqlAlchemyLoadFromDb:
          plugin_sqlalchemy: SqlAlchemyConnectionConfig,
          plugin_query: QueryConfig = None,
          plugin_params: ParamsConfig = None,
-         logger: Logger = None,
          **kwargs,
      ):
          """
@@ -31,16 +34,16 @@ class SqlAlchemyLoadFromDb:
              logger: An optional logger instance.
              **kwargs: Must contain 'index_column' for Dask partitioning.
          """
+         super().__init__(**kwargs)
          self.db_connection = plugin_sqlalchemy
          self.model = self.db_connection.model
          self.engine = self.db_connection.engine
-         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
          self.query_config = plugin_query
          self.params_config = plugin_params
-         self.debug = kwargs.get("debug", False)
          self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+         self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
 
-     def build_and_load(self) -> dd.DataFrame:
+     def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
          """
          Builds and loads a Dask DataFrame from a SQLAlchemy source.
 
@@ -58,17 +61,20 @@ class SqlAlchemyLoadFromDb:
                  engine=self.engine,
                  chunk_size=self.chunk_size,
                  logger=self.logger,
+                 verbose=self.verbose,
                  debug=self.debug
              )
-             # Create the lazy DataFrame
-             dask_df = sqlalchemy_dask_loader.read_frame()
-             return dask_df
+             # Create the lazy DataFrame and read a record count
+             # if total_records less than 0, it means an error occurred during the loading process
+             self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+             return self.total_records, dask_df
 
 
          except Exception as e:
-             self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
+             self.total_records = -1
+             self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
              # Return an empty dataframe with the correct schema on failure
              columns = [c.name for c in self.model.__table__.columns]
-             return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+             return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
 
 
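Note on the call-site impact of this change: build_and_load() now returns a (total_records, dataframe) pair instead of a bare Dask DataFrame, with a negative count signalling a load failure and an empty frame carrying the model's schema. A minimal sketch of the new calling convention, assuming conn_cfg, query_cfg and params_cfg are already-built config objects (illustrative names, not part of the package):

    loader = SqlAlchemyLoadFromDb(
        plugin_sqlalchemy=conn_cfg,   # SqlAlchemyConnectionConfig
        plugin_query=query_cfg,       # QueryConfig
        plugin_params=params_cfg,     # ParamsConfig
        index_column="id",            # required kwarg per the docstring
        debug=True,
    )
    total_records, ddf = loader.build_and_load()
    if total_records < 0:
        # Loading failed; ddf is an empty frame with the model's columns.
        ...
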
@@ -54,8 +54,6 @@ class SqlAlchemyModelBuilder:
          The dynamically created ORM model class.
          """
          with self._lock:
-             # ✅ REFACTOR: Add a comment acknowledging the risk of using an
-             # internal API. This is a maintenance warning for future developers.
              # NOTE: Using a private SQLAlchemy API. This is a performance
              # optimization but may break in future versions of the library.
              registered_model = Base.registry._class_registry.get(self.class_name)
@@ -103,104 +101,4 @@ class SqlAlchemyModelBuilder:
              return f"{sane_name}_field"
          return sane_name
 
- # import re
- # import keyword
- # import threading
- # from sqlalchemy import MetaData, Engine
- # from sqlalchemy.orm import DeclarativeBase
- #
- #
- #
- # class Base(DeclarativeBase):
- #     """shared declarative base for all ORM models."""
- #     pass
- #
- #
- # apps_label = "datacubes.models"
- #
- #
- # class SqlAlchemyModelBuilder:
- #     """
- #     Builds a single SQLAlchemy ORM model from a specific database table.
- #     This class is thread-safe and caches reflected table metadata to
- #     improve performance across multiple instantiations.
- #     """
- #     _lock = threading.Lock()
- #     _metadata_cache: dict[str, MetaData] = {}
- #
- #     def __init__(self, engine: Engine, table_name: str):
- #         """
- #         Initializes the model builder for a specific table.
- #
- #         Args:
- #             engine: The SQLAlchemy engine connected to the database.
- #             table_name: The name of the table to generate the model for.
- #         """
- #         self.engine = engine
- #         self.table_name = table_name
- #         self.class_name = self._normalize_class_name(self.table_name)
- #
- #         # Use or create a cached MetaData object for this engine to avoid
- #         # re-reading the schema for tables that are already known.
- #         engine_key = str(engine.url)
- #         if engine_key not in self._metadata_cache:
- #             self._metadata_cache[engine_key] = MetaData()
- #         self.metadata = self._metadata_cache[engine_key]
- #
- #     def build_model(self) -> type:
- #         """
- #         Builds and returns a database model class for the specified table.
- #         This process is atomic and thread-safe.
- #
- #         Raises:
- #             ValueError: If the specified table does not exist in the database.
- #         Returns:
- #             The dynamically created ORM model class.
- #         """
- #         with self._lock:
- #             # First, check if the model class is already registered in SQLAlchemy
- #             registered_model = Base.registry._class_registry.get(self.class_name)
- #             if registered_model:
- #                 return registered_model
- #
- #             # Next, check if the table's schema is in our metadata cache
- #             table = self.metadata.tables.get(self.table_name)
- #
- #             # If not cached, reflect it from the database
- #             if table is None:
- #                 self.metadata.reflect(bind=self.engine, only=[self.table_name])
- #                 table = self.metadata.tables.get(self.table_name)
- #
- #             if table is None:
- #                 raise ValueError(
- #                     f"Table '{self.table_name}' does not exist in the database."
- #                 )
- #
- #             # Create the model class dynamically.
- #             # No need to add columns manually; __table__ handles it.
- #             attrs = {
- #                 "__tablename__": table.name,
- #                 "__table__": table,
- #                 "__module__": apps_label,
- #             }
- #             model = type(self.class_name, (Base,), attrs)
- #
- #             return model
- #
- #     @staticmethod
- #     def _normalize_class_name(table_name: str) -> str:
- #         """Converts a snake_case table_name to a CamelCase class name."""
- #         return "".join(word.capitalize() for word in table_name.split("_"))
- #
- #     @staticmethod
- #     def _normalize_column_name(column_name: str) -> str:
- #         """
- #         Sanitizes a column name to be a valid Python identifier.
- #         (Kept for utility, though not used in the final model creation).
- #         """
- #         sane_name = re.sub(r"\W", "_", column_name)
- #         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
- #
- #         if keyword.iskeyword(sane_name):
- #             return f"{sane_name}_field"
- #         return sane_name
+
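The NOTE kept above flags Base.registry._class_registry as a private SQLAlchemy API. For reference, a hedged sketch of an equivalent lookup that stays on public APIs (SQLAlchemy 1.4+; it scans all mappers, which is presumably why the package keeps the faster private-registry path):

    def find_registered_model(base, class_name: str):
        # registry.mappers is a public, read-only collection of Mapper objects
        for mapper in base.registry.mappers:
            if mapper.class_.__name__ == class_name:
                return mapper.class_
        return None
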
@@ -1,6 +1,7 @@
  from __future__ import annotations
 
  from .log_utils import Logger
+ from .base import ManagedResource
  from .date_utils import *
  from .data_utils import DataUtils
  from .file_utils import FileUtils
@@ -20,6 +21,7 @@ from .manifest_manager import MissingManifestManager
 
  __all__ = [
      "Logger",
+     "ManagedResource",
      "ConfigManager",
      "ConfigLoader",
      "DateUtils",
@@ -38,6 +40,5 @@ __all__ = [
      "FsRegistry",
      "DataFromHttpSource",
      "WebDAVClient",
-     "MissingManifestManager",
+     "MissingManifestManager"
  ]
-
sibi_dst/utils/base.py ADDED
@@ -0,0 +1,117 @@
+ import asyncio
+ from .log_utils import Logger
+
+ class ManagedResource:
+     """
+     A base class providing context management for resources like loggers and filesystems.
+
+     It handles the creation and cleanup of these resources, ensuring they are only
+     closed if they were created by the instance itself.
+     """
+
+     def __init__(self, **kwargs):
+         self.debug = kwargs.get("debug", False)
+         self.verbose = kwargs.get("verbose", False)
+
+         # --- Logger Management (Refactored) ---
+         logger = kwargs.get("logger")
+         if logger:
+             # An existing logger instance was provided by the user
+             self.logger = logger
+             self._own_logger = False
+             self.logger.debug(f"'{self.__class__.__name__}' is tapping into an existing logger.")
+         else:
+             # No pre-configured logger, so we will create and "own" a new one.
+             self._own_logger = True
+             logger_config = kwargs.get("logger_config", {})
+
+             # Set default logger_name if not specified in the config
+             logger_config.setdefault("logger_name", self.__class__.__name__)
+
+             # Set log_level based on debug flag, but respect user-provided level
+             default_level = Logger.DEBUG if self.debug else Logger.INFO
+             logger_config.setdefault("log_level", default_level)
+
+             # Create the logger using the provided or default configuration
+             self.logger = Logger.default_logger(**logger_config)
+             if self.logger:
+                 self.logger.debug(f"'{self.__class__.__name__}' is starting its own logger.")
+
+         fs = kwargs.get("fs")
+         self._own_fs = fs is None
+         self.fs = fs or None  # we want to allow None as a valid fs to trigger a failure if needed
+
+         self._entered = False
+
+     def __enter__(self):
+         """Enter the runtime context."""
+         self._entered = True
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Exit the runtime context and trigger cleanup."""
+         self.cleanup()
+         return False  # Propagate exceptions
+
+     # --- Asynchronous Context Management ---
+
+     async def __aenter__(self):
+         """Enter the runtime context for 'async with' statements."""
+         self._entered = True
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Exit the runtime context and trigger cleanup for 'async with' statements."""
+         await self.acleanup()
+         return False  # Propagate exceptions
+
+     def __repr__(self) -> str:
+         """Return an unambiguous string representation of the ManagedResource."""
+         # Dynamically get the name of the class or subclass
+         class_name = self.__class__.__name__
+
+         # Determine the status of the logger and filesystem
+         logger_status = "own" if self._own_logger else "external"
+         fs_status = "own" if self._own_fs else "external"
+
+         return (
+             f"<{class_name} debug={self.debug}, "
+             f"logger='{logger_status}', fs='{fs_status}'>"
+         )
+
+     def cleanup(self):
+         """
+         Cleanup resources managed by this instance.
+         """
+         if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+             if self.logger:
+                 self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+             self.fs.clear_instance_cache()
+
+         if self._own_logger and hasattr(self.logger, "shutdown"):
+             # Ensure the logger exists before trying to use or shut it down
+             if self.logger:
+                 self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+                 self.logger.shutdown()
+             self.logger = None  # Set to None after shutdown
+
+         self._entered = False
+
+     async def acleanup(self):
+         """
+         Async Cleanup resources managed by this instance.
+         """
+         if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+             if self.logger:
+                 self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+             self.fs.clear_instance_cache()
+
+         if self._own_logger and hasattr(self.logger, "shutdown"):
+             # Ensure the logger exists before trying to use or shut it down
+             if self.logger:
+                 self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+                 self.logger.shutdown()
+             self.logger = None  # Set to None after shutdown
+
+         self._entered = False
+
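For orientation, a minimal sketch of how a subclass is expected to use ManagedResource (the CsvExporter class and the local-filesystem choice are illustrative, not part of the package): the base class wires up self.logger, self.fs, self.debug and self.verbose from kwargs, and cleanup() runs on context exit, shutting down only resources the instance created itself.

    import fsspec
    from sibi_dst.utils import ManagedResource

    class CsvExporter(ManagedResource):  # hypothetical subclass
        def export(self, rows, path: str):
            self.logger.info(f"writing {len(rows)} rows to {path}")
            with self.fs.open(path, "w") as fh:
                fh.write("\n".join(",".join(map(str, r)) for r in rows))

    with CsvExporter(fs=fsspec.filesystem("file"), debug=True) as exporter:
        exporter.export([(1, "a"), (2, "b")], "/tmp/demo.csv")
    # On exit, cleanup() runs: the externally supplied fs is left untouched,
    # while the logger the instance created for itself is shut down.
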
@@ -1,14 +1,15 @@
  from concurrent.futures import ThreadPoolExecutor
+ from typing import ClassVar, Dict
 
  import clickhouse_connect
  import pandas as pd
  from clickhouse_driver import Client
  import dask.dataframe as dd
 
- from .log_utils import Logger
+ from . import ManagedResource
 
 
- class ClickHouseWriter:
+ class ClickHouseWriter(ManagedResource):
      """
      Provides functionality to write a Dask DataFrame to a ClickHouse database using
      a specified schema. This class handles the creation of tables, schema generation,
@@ -36,7 +37,7 @@ class ClickHouseWriter:
      :ivar order_by: Field or column name to use for table ordering.
      :type order_by: str
      """
-     dtype_to_clickhouse = {
+     dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
          'int64': 'Int64',
          'int32': 'Int32',
          'float64': 'Float64',
@@ -48,7 +49,8 @@ class ClickHouseWriter:
      }
      df: dd.DataFrame
 
-     def __init__(self, logger=None, **kwargs):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
          self.clickhouse_host = kwargs.setdefault('host', "localhost")
         self.clickhouse_port = kwargs.setdefault('port', 8123)
          self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
@@ -56,7 +58,7 @@ class ClickHouseWriter:
          self.clickhouse_password = kwargs.setdefault('password', '')
          self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
 
-         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+         #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
          self.client = None
          self.order_by = kwargs.setdefault('order_by', 'id')
 
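A note on the ClassVar[Dict[str, str]] annotation above: it does not change runtime behaviour; it tells type checkers that dtype_to_clickhouse is a single class-level lookup table rather than a per-instance attribute, and it quiets mutable-class-attribute lints. A small, generic illustration:

    from typing import ClassVar, Dict

    class Example:
        mapping: ClassVar[Dict[str, str]] = {"int64": "Int64"}  # shared by all instances
        name: str                                               # ordinary instance attribute
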
@@ -3,26 +3,27 @@ import logging
  import threading
  import time
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Type, Any, Dict, Optional, Union, List
+ from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 
  import fsspec
  import pandas as pd
  from tqdm import tqdm
 
+ from . import ManagedResource
  from .log_utils import Logger
  from .parquet_saver import ParquetSaver
 
 
- class DataWrapper:
-     DEFAULT_PRIORITY_MAP = {
+ class DataWrapper(ManagedResource):
+     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
          "overwrite": 1,
          "missing_in_history": 2,
          "existing_but_stale": 3,
          "missing_outside_history": 4,
          "file_is_recent": 0
      }
-     DEFAULT_MAX_AGE_MINUTES = 1440
-     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
+     DEFAULT_MAX_AGE_MINUTES: int = 1440
+     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
      def __init__(
          self,
@@ -30,26 +31,20 @@ class DataWrapper:
          date_field: str,
          data_path: str,
          parquet_filename: str,
-         fs: Optional[fsspec.AbstractFileSystem] = None,
-         debug: bool = False,
-         verbose: bool = False,
          class_params: Optional[Dict] = None,
          load_params: Optional[Dict] = None,
-         logger: Logger = None,
          show_progress: bool = False,
          timeout: float = 30,
          max_threads: int = 3,
          **kwargs: Any,
      ):
+         super().__init__(**kwargs)
          self.dataclass = dataclass
          self.date_field = date_field
          self.data_path = self._ensure_forward_slash(data_path)
          self.parquet_filename = parquet_filename
-         self.fs = fs or None
-         self.debug = debug
-         self.verbose = verbose
-         self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
-         self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+         if self.fs is None:
+             raise ValueError("Datawrapper requires a File system (fs) to be provided .")
          self.show_progress = show_progress
          self.timeout = timeout
          self.max_threads = max_threads
@@ -66,25 +61,15 @@ class DataWrapper:
          self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
          self.mmanifest = kwargs.get("mmanifest", None)
          self.update_planner=kwargs.get("update_planner", None)
-         self.datacls = self.dataclass(**self.class_params)
 
-     def __enter__(self):
-         """Context manager entry"""
-         return self
 
      def __exit__(self, exc_type, exc_val, exc_tb):
          """Context manager exit"""
-         if self.mmanifest and self.mmanifest._new_records:
+         if self.mmanifest:
              self.mmanifest.save()
-             self.mmanifest.cleanup_temp_manifests()
-         if exc_type is not None:
-             self.logger.error(f"Exception occurred: {exc_val}")
+         super().__exit__(exc_type, exc_val, exc_tb)
          return False
 
-     def _init_filesystem(self) -> fsspec.AbstractFileSystem:
-         with self._lock:
-             return fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-
      @staticmethod
      def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
          if isinstance(date, datetime.date):
@@ -101,78 +86,68 @@ class DataWrapper:
      def process(self, max_retries: int = 3):
          """Process updates with priority-based execution, retries, benchmarking and progress updates"""
          overall_start = time.perf_counter()
-         plan = self.update_planner.plan
-         # Use len(plan.index) instead of plan.empty for Dask compatibility
-         plan_count = len(plan.index)
-         if plan_count == 0:
-             self.logger.info("No updates required")
+         tasks = list(self.update_planner.get_tasks_by_priority())
+         if not tasks:
+             self.logger.info("No updates required based on the current plan.")
              return
-         self.logger.info(f"Update plan for {self.dataclass.__name__} includes {plan_count} items for update")
 
-         if self.verbose:
+         if self.update_planner.show_progress:
              self.update_planner.show_update_plan()
 
-         for priority in sorted(plan["update_priority"].unique()):
-             self._process_priority_group(plan, priority, max_retries)
+         for priority, dates in tasks:
+             self._execute_task_batch(priority, dates, max_retries)
 
          total_time = time.perf_counter() - overall_start
-         processed = len(self.processed_dates)
-         if processed:
-             self.logger.info(
-                 f"Processed {processed} dates in {total_time:.1f}s "
-                 f"(avg {total_time / processed:.1f}s per date)"
-             )
-         if self.show_progress or self.verbose:
+         if self.processed_dates:
+             count = len(self.processed_dates)
+             self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+         if self.update_planner.show_progress:
              self.show_benchmark_summary()
 
-     def _process_priority_group(
-             self,
-             plan: pd.DataFrame,
-             priority: int,
-             max_retries: int
-     ):
-         """Process a single priority group with parallel execution and timing"""
-         dates = plan[plan["update_priority"] == priority]["date"].tolist()
-         if not dates:
-             return
+
+     def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
+         """Executes a single batch of tasks (dates) using a thread pool."""
          desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
-         self.logger.debug(f"Starting {desc.lower()}")
-         group_start = time.perf_counter()
          max_thr = min(len(dates), self.max_threads)
-         self.logger.debug(f"Max threads for priority {priority}: {max_thr}")
+         self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+
          with ThreadPoolExecutor(max_workers=max_thr) as executor:
              futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
-             for future in tqdm(as_completed(futures), total=len(futures), desc=desc, disable=not self.show_progress):
-                 date = futures[future]
+             iterator = as_completed(futures)
+             if self.show_progress:
+                 iterator = tqdm(iterator, total=len(futures), desc=desc)
+
+             for future in iterator:
                  try:
                      future.result(timeout=self.timeout)
                  except Exception as e:
-                     self.logger.error(f"Permanent failure processing {date}: {e}")
-         group_time = time.perf_counter() - group_start
-         self.logger.info(f"Priority {priority} group processed {len(dates)} dates in {group_time:.1f}s")
+                     self.logger.error(f"Permanent failure for {futures[future]}: {e}")
 
      def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-         for attempt in range(1, max_retries + 1):
+         """Wrapper to apply retry logic to single date processing."""
+         for attempt in range(max_retries):
              try:
                  self._process_single_date(date)
                  return
              except Exception as e:
-                 if attempt < max_retries:
-                     self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {e}")
+                 if attempt < max_retries - 1:
+                     self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
+                     time.sleep(2 ** attempt) # Exponential backoff
                  else:
-                     raise RuntimeError(f"Failed processing {date} after {max_retries} attempts") from e
+                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+                     #raise
 
      def _process_single_date(self, date: datetime.date):
          """Core date processing logic with load/save timing and thread reporting"""
          path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
          self.logger.debug(f"Processing date {date.isoformat()} for {path}")
          if path in self.update_planner.skipped and self.update_planner.ignore_missing:
-             self.logger.info(f"Skipping {date} as it exists in the skipped list")
+             self.logger.debug(f"Skipping {date} as it exists in the skipped list")
              return
          full_path = f"{path}{self.parquet_filename}"
 
-         thread_name = threading.current_thread().name
-         self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+         #thread_name = threading.current_thread().name
+         #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
 
          overall_start = time.perf_counter()
          try:
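The retry change above replaces the old raise-on-exhaustion behaviour with exponential backoff (time.sleep(2 ** attempt)) and a logged, swallowed final failure (the #raise is left commented out). A standalone sketch of the same pattern, re-raising on the last attempt for callers that prefer the stricter behaviour:

    import time

    def run_with_retry(task, max_retries: int = 3):
        for attempt in range(max_retries):
            try:
                return task()
            except Exception:
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # back off 1s, 2s, 4s, ...
                else:
                    raise  # surface the final failure to the caller
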
@@ -180,30 +155,30 @@ class DataWrapper:
              date_filter = {f"{self.date_field}__date": {date.isoformat()}}
              self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
              # Load data using the dataclass with the provided date filter
-             self.load_params.update(date_filter)
-             df = self.datacls.load(**self.load_params)
+             # Create a copy to avoid mutating the shared instance dictionary
+             local_load_params = self.load_params.copy()
+             local_load_params.update(date_filter)
+             local_class_instance = self.dataclass(**self.class_params)
+             df=local_class_instance.load(**local_load_params)
              load_time = time.perf_counter() - load_start
-             if df.head(1, compute=True).empty:
-                 if self.mmanifest:
-                     schema = df._meta.dtypes.astype(str).to_dict()
-                     self.mmanifest.record(
-                         full_path=path
-                     )
-                 self.logger.info(f"No data found for {date}. Logged to missing manifest.")
-                 return
-             # Dask-compatible empty check
-             # if len(df.index) == 0:
-             #     self.logger.warning(f"No data found for {date}")
-             #     return
 
+             if hasattr(local_class_instance, "total_records"):
+                 self.logger.debug(f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+                 if int(local_class_instance.total_records) == 0: # If no records were loaded but not due to an error
+                     if self.mmanifest:
+                         self.mmanifest.record(
+                             full_path=path
+                         )
+                     self.logger.info(f"No data found for {date}. Logged to missing manifest.")
+                     return
              save_start = time.perf_counter()
-             with self._lock:
-                 ParquetSaver(
-                     df_result=df,
-                     parquet_storage_path=path,
-                     fs=self.fs,
-                     logger=self.logger
-                 ).save_to_parquet(self.parquet_filename)
+             with ParquetSaver(
+                 df_result=df,
+                 parquet_storage_path=path,
+                 fs=self.fs,
+                 logger=self.logger
+             ) as ps:
+                 ps.save_to_parquet(self.parquet_filename, overwrite=True)
              save_time = time.perf_counter() - save_start
 
              total_time = time.perf_counter() - overall_start
@@ -233,4 +208,4 @@ class DataWrapper:
              return
          df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
          df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-         self.logger.info("Benchmark Summary:\n" + df_bench.to_string())
+         self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
@@ -29,8 +29,9 @@ class DateUtils:
      """
      _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
 
-     def __init__(self, logger=None):
+     def __init__(self, logger=None, debug=False):
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+         self.debug = debug
 
      @classmethod
      def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date: