sibi-dst 2025.1.4__py3-none-any.whl → 2025.1.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
@@ -1,13 +1,17 @@
+ from __future__ import annotations
+
+ from typing import Any
+
  import dask.dataframe as dd
  import pandas as pd
 
+ from sibi_dst.utils import ManagedResource
  from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
- from sibi_dst.utils import Logger
  from ._db_connection import SqlAlchemyConnectionConfig
  from ._io_dask import SQLAlchemyDask
 
 
- class SqlAlchemyLoadFromDb:
+ class SqlAlchemyLoadFromDb(ManagedResource):
  """
  Orchestrates loading data from a database using SQLAlchemy into a Dask
  DataFrame by configuring and delegating to the SQLAlchemyDask loader.
@@ -18,7 +22,6 @@ class SqlAlchemyLoadFromDb:
  plugin_sqlalchemy: SqlAlchemyConnectionConfig,
  plugin_query: QueryConfig = None,
  plugin_params: ParamsConfig = None,
- logger: Logger = None,
  **kwargs,
  ):
  """
@@ -31,16 +34,16 @@ class SqlAlchemyLoadFromDb:
  logger: An optional logger instance.
  **kwargs: Must contain 'index_column' for Dask partitioning.
  """
+ super().__init__(**kwargs)
  self.db_connection = plugin_sqlalchemy
  self.model = self.db_connection.model
  self.engine = self.db_connection.engine
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
  self.query_config = plugin_query
  self.params_config = plugin_params
- self.debug = kwargs.get("debug", False)
  self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+ self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
 
- def build_and_load(self) -> dd.DataFrame:
+ def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
  """
  Builds and loads a Dask DataFrame from a SQLAlchemy source.
 
@@ -58,17 +61,20 @@ class SqlAlchemyLoadFromDb:
  engine=self.engine,
  chunk_size=self.chunk_size,
  logger=self.logger,
+ verbose=self.verbose,
  debug=self.debug
  )
- # Create the lazy DataFrame
- dask_df = sqlalchemy_dask_loader.read_frame()
- return dask_df
+ # Create the lazy DataFrame and read a record count
+ # if total_records less than 0, it means an error occurred during the loading process
+ self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+ return self.total_records, dask_df
 
 
  except Exception as e:
- self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
+ self.total_records = -1
+ self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
  # Return an empty dataframe with the correct schema on failure
  columns = [c.name for c in self.model.__table__.columns]
- return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+ return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
 
 
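From the caller's side, the new contract of build_and_load() is a (total_records, dataframe) pair, with -1 signalling a failure and an empty frame carrying the model's columns returned in that case. A minimal consumer sketch; the config objects are assumed to be built elsewhere and the function name is illustrative, not part of the package:

    def load_table(connection_config, query_config, params_config):
        # connection_config: SqlAlchemyConnectionConfig, query_config: QueryConfig,
        # params_config: ParamsConfig -- all assumed to be constructed by the caller
        loader = SqlAlchemyLoadFromDb(
            plugin_sqlalchemy=connection_config,
            plugin_query=query_config,
            plugin_params=params_config,
            index_column="id",        # required per the docstring above
            debug=True,
        )
        total_records, ddf = loader.build_and_load()
        if total_records < 0:
            # -1 signals a failure; ddf is an empty frame with the model's columns
            raise RuntimeError("load failed, see logs")
        return total_records, ddf  # ddf stays lazy until .compute() is called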
@@ -54,8 +54,6 @@ class SqlAlchemyModelBuilder:
  The dynamically created ORM model class.
  """
  with self._lock:
- # ✅ REFACTOR: Add a comment acknowledging the risk of using an
- # internal API. This is a maintenance warning for future developers.
  # NOTE: Using a private SQLAlchemy API. This is a performance
  # optimization but may break in future versions of the library.
  registered_model = Base.registry._class_registry.get(self.class_name)
@@ -103,104 +101,4 @@ class SqlAlchemyModelBuilder:
  return f"{sane_name}_field"
  return sane_name
 
- # import re
- # import keyword
- # import threading
- # from sqlalchemy import MetaData, Engine
- # from sqlalchemy.orm import DeclarativeBase
- #
- #
- #
- # class Base(DeclarativeBase):
- # """shared declarative base for all ORM models."""
- # pass
- #
- #
- # apps_label = "datacubes.models"
- #
- #
- # class SqlAlchemyModelBuilder:
- # """
- # Builds a single SQLAlchemy ORM model from a specific database table.
- # This class is thread-safe and caches reflected table metadata to
- # improve performance across multiple instantiations.
- # """
- # _lock = threading.Lock()
- # _metadata_cache: dict[str, MetaData] = {}
- #
- # def __init__(self, engine: Engine, table_name: str):
- # """
- # Initializes the model builder for a specific table.
- #
- # Args:
- # engine: The SQLAlchemy engine connected to the database.
- # table_name: The name of the table to generate the model for.
- # """
- # self.engine = engine
- # self.table_name = table_name
- # self.class_name = self._normalize_class_name(self.table_name)
- #
- # # Use or create a cached MetaData object for this engine to avoid
- # # re-reading the schema for tables that are already known.
- # engine_key = str(engine.url)
- # if engine_key not in self._metadata_cache:
- # self._metadata_cache[engine_key] = MetaData()
- # self.metadata = self._metadata_cache[engine_key]
- #
- # def build_model(self) -> type:
- # """
- # Builds and returns a database model class for the specified table.
- # This process is atomic and thread-safe.
- #
- # Raises:
- # ValueError: If the specified table does not exist in the database.
- # Returns:
- # The dynamically created ORM model class.
- # """
- # with self._lock:
- # # First, check if the model class is already registered in SQLAlchemy
- # registered_model = Base.registry._class_registry.get(self.class_name)
- # if registered_model:
- # return registered_model
- #
- # # Next, check if the table's schema is in our metadata cache
- # table = self.metadata.tables.get(self.table_name)
- #
- # # If not cached, reflect it from the database
- # if table is None:
- # self.metadata.reflect(bind=self.engine, only=[self.table_name])
- # table = self.metadata.tables.get(self.table_name)
- #
- # if table is None:
- # raise ValueError(
- # f"Table '{self.table_name}' does not exist in the database."
- # )
- #
- # # Create the model class dynamically.
- # # No need to add columns manually; __table__ handles it.
- # attrs = {
- # "__tablename__": table.name,
- # "__table__": table,
- # "__module__": apps_label,
- # }
- # model = type(self.class_name, (Base,), attrs)
- #
- # return model
- #
- # @staticmethod
- # def _normalize_class_name(table_name: str) -> str:
- # """Converts a snake_case table_name to a CamelCase class name."""
- # return "".join(word.capitalize() for word in table_name.split("_"))
- #
- # @staticmethod
- # def _normalize_column_name(column_name: str) -> str:
- # """
- # Sanitizes a column name to be a valid Python identifier.
- # (Kept for utility, though not used in the final model creation).
- # """
- # sane_name = re.sub(r"\W", "_", column_name)
- # sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
- #
- # if keyword.iskeyword(sane_name):
- # return f"{sane_name}_field"
- # return sane_name
+
@@ -1,6 +1,7 @@
  from __future__ import annotations
 
  from .log_utils import Logger
+ from .base import ManagedResource
  from .date_utils import *
  from .data_utils import DataUtils
  from .file_utils import FileUtils
@@ -20,6 +21,7 @@ from .manifest_manager import MissingManifestManager
 
  __all__ = [
  "Logger",
+ "ManagedResource",
  "ConfigManager",
  "ConfigLoader",
  "DateUtils",
@@ -38,6 +40,5 @@ __all__ = [
  "FsRegistry",
  "DataFromHttpSource",
  "WebDAVClient",
- "MissingManifestManager",
+ "MissingManifestManager"
  ]
-
sibi_dst/utils/base.py ADDED
@@ -0,0 +1,97 @@
+ from .log_utils import Logger
+
+ class ManagedResource:
+ """
+ A base class providing context management for resources like loggers and filesystems.
+
+ It handles the creation and cleanup of these resources, ensuring they are only
+ closed if they were created by the instance itself.
+ """
+
+ def __init__(self, **kwargs):
+ self.debug = kwargs.get("debug", False)
+ self.verbose = kwargs.get("verbose", False)
+
+ # --- Logger Management (Refactored) ---
+ logger = kwargs.get("logger")
+ if logger:
+ # An existing logger instance was provided by the user
+ self.logger = logger
+ self._own_logger = False
+ self.logger.debug(f"'{self.__class__.__name__}' is tapping into an existing logger.")
+ else:
+ # No pre-configured logger, so we will create and "own" a new one.
+ self._own_logger = True
+ logger_config = kwargs.get("logger_config", {})
+
+ # Set default logger_name if not specified in the config
+ logger_config.setdefault("logger_name", self.__class__.__name__)
+
+ # Set log_level based on debug flag, but respect user-provided level
+ default_level = Logger.DEBUG if self.debug else Logger.INFO
+ logger_config.setdefault("log_level", default_level)
+
+ # Create the logger using the provided or default configuration
+ self.logger = Logger.default_logger(**logger_config)
+ if self.logger:
+ self.logger.debug(f"'{self.__class__.__name__}' is starting its own logger.")
+
+ fs = kwargs.get("fs")
+ self._own_fs = fs is None
+ self.fs = fs or None # we want to allow None as a valid fs to trigger a failure if needed
+
+ self._entered = False
+
+ def __enter__(self):
+ """Enter the runtime context."""
+ self._entered = True
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """Exit the runtime context and trigger cleanup."""
+ self.cleanup()
+ return False # Propagate exceptions
+
+ # --- Asynchronous Context Management ---
+
+ async def __aenter__(self):
+ """Enter the runtime context for 'async with' statements."""
+ self._entered = True
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Exit the runtime context and trigger cleanup for 'async with' statements."""
+ self.cleanup()
+ return False # Propagate exceptions
+
+ def __repr__(self) -> str:
+ """Return an unambiguous string representation of the ManagedResource."""
+ # Dynamically get the name of the class or subclass
+ class_name = self.__class__.__name__
+
+ # Determine the status of the logger and filesystem
+ logger_status = "own" if self._own_logger else "external"
+ fs_status = "own" if self._own_fs else "external"
+
+ return (
+ f"<{class_name} debug={self.debug}, "
+ f"logger='{logger_status}', fs='{fs_status}'>"
+ )
+
+ def cleanup(self):
+ """
+ Clean up resources managed by this instance.
+ """
+ if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+ if self.logger:
+ self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+ self.fs.clear_instance_cache()
+
+ if self._own_logger and hasattr(self.logger, "shutdown"):
+ # Ensure logger exists before trying to use or shut it down
+ if self.logger:
+ self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+ self.logger.shutdown()
+ self.logger = None # Set to None after shutdown
+
+ self._entered = False
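A minimal sketch of how a subclass is meant to consume this base class; the subclass name and the fsspec filesystem below are illustrative, not part of the package:

    import fsspec

    from sibi_dst.utils import ManagedResource

    class ParquetMover(ManagedResource):       # hypothetical subclass for illustration
        def run(self):
            # logger, debug, verbose and fs are all set up by ManagedResource.__init__
            self.logger.info("moving files ...")

    # A passed-in fs or logger is marked "external" and left alone by cleanup();
    # omitting them makes the instance create and own its logger.
    with ParquetMover(fs=fsspec.filesystem("file"), debug=True) as mover:
        mover.run()
    # on exit, cleanup() shuts down an owned logger and clears an owned fs instance cache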
@@ -5,10 +5,10 @@ import pandas as pd
  from clickhouse_driver import Client
  import dask.dataframe as dd
 
- from .log_utils import Logger
+ from . import ManagedResource
 
 
- class ClickHouseWriter:
+ class ClickHouseWriter(ManagedResource):
  """
  Provides functionality to write a Dask DataFrame to a ClickHouse database using
  a specified schema. This class handles the creation of tables, schema generation,
@@ -48,7 +48,8 @@ class ClickHouseWriter:
  }
  df: dd.DataFrame
 
- def __init__(self, logger=None, **kwargs):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
  self.clickhouse_host = kwargs.setdefault('host', "localhost")
  self.clickhouse_port = kwargs.setdefault('port', 8123)
  self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
@@ -56,7 +57,7 @@ class ClickHouseWriter:
  self.clickhouse_password = kwargs.setdefault('password', '')
  self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
 
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+ #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
  self.client = None
  self.order_by = kwargs.setdefault('order_by', 'id')
 
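With the explicit logger parameter gone, everything now travels through **kwargs: ManagedResource picks up debug/verbose/logger/fs, while ClickHouseWriter reads its own connection settings. A hedged construction sketch using only the keys visible in this diff (the values are the defaults shown above, not recommendations):

    writer = ClickHouseWriter(
        host="localhost", port=8123, database="sibi_data",
        password="", table="test_sibi_table", order_by="id",
        debug=True,   # consumed by ManagedResource, not by the writer itself
    )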
@@ -9,11 +9,12 @@ import fsspec
  import pandas as pd
  from tqdm import tqdm
 
+ from . import ManagedResource
  from .log_utils import Logger
  from .parquet_saver import ParquetSaver
 
 
- class DataWrapper:
+ class DataWrapper(ManagedResource):
  DEFAULT_PRIORITY_MAP = {
  "overwrite": 1,
  "missing_in_history": 2,
@@ -30,26 +31,30 @@ class DataWrapper:
  date_field: str,
  data_path: str,
  parquet_filename: str,
- fs: Optional[fsspec.AbstractFileSystem] = None,
- debug: bool = False,
- verbose: bool = False,
+ #fs: Optional[fsspec.AbstractFileSystem] = None,
+ #debug: bool = False,
+ #verbose: bool = False,
  class_params: Optional[Dict] = None,
  load_params: Optional[Dict] = None,
- logger: Logger = None,
+ #logger: Logger = None,
  show_progress: bool = False,
  timeout: float = 30,
  max_threads: int = 3,
  **kwargs: Any,
  ):
+ super().__init__(**kwargs)
  self.dataclass = dataclass
  self.date_field = date_field
  self.data_path = self._ensure_forward_slash(data_path)
  self.parquet_filename = parquet_filename
- self.fs = fs or None
- self.debug = debug
- self.verbose = verbose
- self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
- self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+ #self.fs = fs or None
+ if self.fs is None:
+ raise ValueError("Datawrapper requires a File system (fs) to be provided .")
+ #self.debug = debug
+ #self.verbose = verbose
+ #self._own_logger = logger is None
+ #self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
+ #self.logger.set_level(logging.DEBUG if debug else logging.INFO)
  self.show_progress = show_progress
  self.timeout = timeout
  self.max_threads = max_threads
@@ -66,25 +71,16 @@ class DataWrapper:
  self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
  self.mmanifest = kwargs.get("mmanifest", None)
  self.update_planner=kwargs.get("update_planner", None)
- self.datacls = self.dataclass(**self.class_params)
+ # self.datacls = self.dataclass(**self.class_params)
 
- def __enter__(self):
- """Context manager entry"""
- return self
 
  def __exit__(self, exc_type, exc_val, exc_tb):
  """Context manager exit"""
- if self.mmanifest and self.mmanifest._new_records:
+ if self.mmanifest:
  self.mmanifest.save()
- self.mmanifest.cleanup_temp_manifests()
- if exc_type is not None:
- self.logger.error(f"Exception occurred: {exc_val}")
+ super().__exit__(exc_type, exc_val, exc_tb)
  return False
 
- def _init_filesystem(self) -> fsspec.AbstractFileSystem:
- with self._lock:
- return fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-
  @staticmethod
  def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
  if isinstance(date, datetime.date):
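Because __enter__ and cleanup now come from ManagedResource and __exit__ only adds the manifest save, a call site would look roughly like the sketch below; the dataset class, planner and manifest objects are assumed to exist and are not defined in this diff:

    import fsspec

    def run_update(dataset_cls, planner, manifest):
        # dataset_cls: a loader class exposing .load(); planner and manifest are the
        # update-planner and missing-manifest objects referenced above (built elsewhere)
        fs = fsspec.filesystem("file")        # required: __init__ raises ValueError without an fs
        with DataWrapper(
            dataclass=dataset_cls,
            date_field="created_at",          # illustrative field name
            data_path="/data/mydataset/",     # illustrative path
            parquet_filename="data.parquet",
            fs=fs,                            # forwarded via **kwargs to ManagedResource
            class_params={}, load_params={},
            show_progress=True,
            update_planner=planner,           # expected to expose get_tasks_by_priority()
            mmanifest=manifest,               # optional missing-data manifest
        ) as wrapper:
            wrapper.process()
        # __exit__ saves the manifest (if present), then defers to ManagedResource cleanup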
@@ -101,78 +97,68 @@ class DataWrapper:
  def process(self, max_retries: int = 3):
  """Process updates with priority-based execution, retries, benchmarking and progress updates"""
  overall_start = time.perf_counter()
- plan = self.update_planner.plan
- # Use len(plan.index) instead of plan.empty for Dask compatibility
- plan_count = len(plan.index)
- if plan_count == 0:
- self.logger.info("No updates required")
+ tasks = list(self.update_planner.get_tasks_by_priority())
+ if not tasks:
+ self.logger.info("No updates required based on the current plan.")
  return
- self.logger.info(f"Update plan for {self.dataclass.__name__} includes {plan_count} items for update")
 
- if self.verbose:
+ if self.update_planner.show_progress:
  self.update_planner.show_update_plan()
 
- for priority in sorted(plan["update_priority"].unique()):
- self._process_priority_group(plan, priority, max_retries)
+ for priority, dates in tasks:
+ self._execute_task_batch(priority, dates, max_retries)
 
  total_time = time.perf_counter() - overall_start
- processed = len(self.processed_dates)
- if processed:
- self.logger.info(
- f"Processed {processed} dates in {total_time:.1f}s "
- f"(avg {total_time / processed:.1f}s per date)"
- )
- if self.show_progress or self.verbose:
+ if self.processed_dates:
+ count = len(self.processed_dates)
+ self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+ if self.update_planner.show_progress:
  self.show_benchmark_summary()
 
- def _process_priority_group(
- self,
- plan: pd.DataFrame,
- priority: int,
- max_retries: int
- ):
- """Process a single priority group with parallel execution and timing"""
- dates = plan[plan["update_priority"] == priority]["date"].tolist()
- if not dates:
- return
+
+ def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
+ """Executes a single batch of tasks (dates) using a thread pool."""
  desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
- self.logger.debug(f"Starting {desc.lower()}")
- group_start = time.perf_counter()
  max_thr = min(len(dates), self.max_threads)
- self.logger.debug(f"Max threads for priority {priority}: {max_thr}")
+ self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+
  with ThreadPoolExecutor(max_workers=max_thr) as executor:
  futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
- for future in tqdm(as_completed(futures), total=len(futures), desc=desc, disable=not self.show_progress):
- date = futures[future]
+ iterator = as_completed(futures)
+ if self.show_progress:
+ iterator = tqdm(iterator, total=len(futures), desc=desc)
+
+ for future in iterator:
  try:
  future.result(timeout=self.timeout)
  except Exception as e:
- self.logger.error(f"Permanent failure processing {date}: {e}")
- group_time = time.perf_counter() - group_start
- self.logger.info(f"Priority {priority} group processed {len(dates)} dates in {group_time:.1f}s")
+ self.logger.error(f"Permanent failure for {futures[future]}: {e}")
 
  def _process_date_with_retry(self, date: datetime.date, max_retries: int):
- for attempt in range(1, max_retries + 1):
+ """Wrapper to apply retry logic to single date processing."""
+ for attempt in range(max_retries):
  try:
  self._process_single_date(date)
  return
  except Exception as e:
- if attempt < max_retries:
- self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {e}")
+ if attempt < max_retries - 1:
+ self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
+ time.sleep(2 ** attempt) # Exponential backoff
  else:
- raise RuntimeError(f"Failed processing {date} after {max_retries} attempts") from e
+ self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+ #raise
 
  def _process_single_date(self, date: datetime.date):
  """Core date processing logic with load/save timing and thread reporting"""
  path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
  self.logger.debug(f"Processing date {date.isoformat()} for {path}")
  if path in self.update_planner.skipped and self.update_planner.ignore_missing:
- self.logger.info(f"Skipping {date} as it exists in the skipped list")
+ self.logger.debug(f"Skipping {date} as it exists in the skipped list")
  return
  full_path = f"{path}{self.parquet_filename}"
 
- thread_name = threading.current_thread().name
- self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+ #thread_name = threading.current_thread().name
+ #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
 
  overall_start = time.perf_counter()
  try:
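The retry wrapper above is the usual bounded-retries-with-exponential-backoff pattern; stripped of the class context it reduces to a sketch like this (note that the new code logs and swallows the final failure instead of re-raising):

    import time

    def call_with_retry(fn, max_retries=3):
        """Retry fn(), sleeping 1s, 2s, 4s, ... between attempts; give up after max_retries."""
        for attempt in range(max_retries):
            try:
                return fn()
            except Exception:
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # 1, 2, 4, ... seconds
                else:
                    raise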
@@ -180,30 +166,29 @@ class DataWrapper:
  date_filter = {f"{self.date_field}__date": {date.isoformat()}}
  self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
  # Load data using the dataclass with the provided date filter
- self.load_params.update(date_filter)
- df = self.datacls.load(**self.load_params)
+ # Create a copy to avoid mutating the shared instance dictionary
+ local_load_params = self.load_params.copy()
+ local_load_params.update(date_filter)
+ local_class_instance = self.dataclass(**self.class_params)
+ df=local_class_instance.load(**local_load_params)
  load_time = time.perf_counter() - load_start
- if df.head(1, compute=True).empty:
- if self.mmanifest:
- schema = df._meta.dtypes.astype(str).to_dict()
- self.mmanifest.record(
- full_path=path
- )
- self.logger.info(f"No data found for {date}. Logged to missing manifest.")
- return
- # Dask-compatible empty check
- # if len(df.index) == 0:
- # self.logger.warning(f"No data found for {date}")
- # return
 
+ if hasattr(local_class_instance, "total_records"):
+ self.logger.debug(f"Total records loaded by {local_class_instance}: {local_class_instance.total_records}")
+ if int(local_class_instance.total_records) == 0: # If no records were loaded but not due to an error
+ if self.mmanifest:
+ self.mmanifest.record(
+ full_path=path
+ )
+ self.logger.info(f"No data found for {date}. Logged to missing manifest.")
+ return
  save_start = time.perf_counter()
- with self._lock:
- ParquetSaver(
- df_result=df,
- parquet_storage_path=path,
- fs=self.fs,
- logger=self.logger
- ).save_to_parquet(self.parquet_filename)
+ ParquetSaver(
+ df_result=df,
+ parquet_storage_path=path,
+ fs=self.fs,
+ logger=self.logger
+ ).save_to_parquet(self.parquet_filename, overwrite=True)
  save_time = time.perf_counter() - save_start
 
  total_time = time.perf_counter() - overall_start
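The copy-before-update in the load step matters because _process_single_date runs on several worker threads at once: updating the shared self.load_params dict in place would leak one date's filter into another thread's call. A small illustrative sketch of the difference (names are hypothetical):

    shared_load_params = {"fields": ["id", "amount"]}   # shared across threads

    def build_params_buggy(date_filter):
        shared_load_params.update(date_filter)          # mutates shared state; races between dates
        return shared_load_params

    def build_params_safe(date_filter):
        local = shared_load_params.copy()               # per-task copy, as in the new code above
        local.update(date_filter)
        return local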
@@ -29,8 +29,9 @@ class DateUtils:
  """
  _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
 
- def __init__(self, logger=None):
+ def __init__(self, logger=None, debug=False):
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+ self.debug = debug
 
  @classmethod
  def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date: