sibi-dst 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
 import datetime
 import logging
+import threading
 from typing import Optional, Any, Dict
 
 import dask.dataframe as dd
@@ -78,6 +79,7 @@ class ParquetArtifact(DfHelper):
             `parquet_filename`, `parquet_start_date`,
             or `parquet_end_date`) are missing or not set properly.
         """
+        self._lock = threading.Lock()
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,
@@ -119,21 +121,36 @@ class ParquetArtifact(DfHelper):
         super().__init__(**self.config)
 
     def load(self, **kwargs):
-        self.df = super().load(**kwargs)
+        with self._lock:
+            self.df = super().load(**kwargs)
         return self.df
 
     def generate_parquet(self, **kwargs) -> None:
         """
         Generate a Parquet file using the configured DataWrapper class.
         """
-        params = self._prepare_params(kwargs)
-        dw = DataWrapper(self.data_wrapper_class, **params)
-        dw.process()
+        with self._lock:
+            params = self._prepare_params(kwargs)
+            dw = DataWrapper(self.data_wrapper_class, **params)
+            dw.process()
+
+    def __enter__(self):
+        if getattr(self, "_entered", False):
+            return self
+        self._entered = True
+        self.ensure_directory_exists(self.parquet_storage_path)
+        return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        # Ensure resources are cleaned up
-        if self.fs:
-            self.fs.close()
+        try:
+            if getattr(self, "_entered", False) and self.fs:
+                self.fs.close()
+        except Exception as e:
+            self.logger.warning(f"Error closing filesystem: {e}")
+        finally:
+            self._entered = False
+            # return False so exceptions aren't suppressed
+            return False
 
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
         """Update the Parquet file with data from a specific period."""
@@ -223,7 +240,8 @@ class ParquetArtifact(DfHelper):
 
     def ensure_directory_exists(self, path: str) -> None:
         """Ensure the directory exists in the specified filesystem."""
-        try:
-            self.fs.makedirs(path, exist_ok=True)
-        except Exception as e:
-            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+        with self._lock:
+            try:
+                self.fs.makedirs(path, exist_ok=True)
+            except Exception as e:
+                raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
@@ -1,109 +1,202 @@
-from typing import Any, Optional, ClassVar
+from __future__ import annotations
+from typing import Any, Optional, ClassVar, Generator, Type
 import threading
-
-from pydantic import BaseModel, model_validator
-from sqlalchemy import create_engine
+from contextlib import contextmanager
+from pydantic import BaseModel, field_validator, ValidationError, model_validator
+from sqlalchemy import create_engine, event, text
+from sqlalchemy.engine import url as sqlalchemy_url
+from sqlalchemy.engine import Engine
 from sqlalchemy.exc import OperationalError
-from sqlalchemy.sql import text
+from sqlalchemy.pool import QueuePool, NullPool, StaticPool
+from sqlalchemy.orm import sessionmaker, Session
 from sibi_dst.utils import Logger
-
 from ._sql_model_builder import SqlAlchemyModelBuilder
 
+
 class SqlAlchemyConnectionConfig(BaseModel):
     """
-    Configuration class for managing an SQLAlchemy database connection.
-
-    This class provides configurations to establish a connection to a database,
-    validate the connection, and dynamically build a SQLAlchemy model for a specific
-    table if required. It initializes the database engine using the provided connection URL
-    and ensures that the connection and table information are properly validated.
-
-    :ivar connection_url: The URL used to connect to the database.
-    :type connection_url: str
-    :ivar table: The name of the database table for which a model will be constructed.
-    :type table: Optional[str]
-    :ivar model: The dynamically built SQLAlchemy model for the specified table.
-    :type model: Any
-    :ivar engine: The SQLAlchemy engine instance reused for database connections.
-    :type engine: Optional[Any]
+    Thread-safe, registry-backed SQLAlchemy connection manager with:
+      - Shared engine reuse
+      - Active connection tracking
+      - Idle-pool and database-level cleanup
+      - Dynamic ORM model building via SqlAlchemyModelBuilder
+      - Optional session factory
     """
     connection_url: str
     table: Optional[str] = None
-    model: Any = None
-    engine: Optional[Any] = None
-    logger: Optional[Any] = None
-    pool_size: int = 10
-    max_overflow: int = 5
+    model: Optional[Any] = None
+    engine: Optional[Engine] = None
+    logger: Logger = None
+    debug: bool = False
+
+    pool_size: int = 5
+    max_overflow: int = 10
     pool_timeout: int = 30
-    pool_recycle:int = 300
+    pool_recycle: int = 300
+    pool_pre_ping: bool = True
+    poolclass: Type = QueuePool
 
-    # Class-level registry and lock for thread-safe engine reuse
-    _engine_registry: ClassVar[dict] = {}
+    session_factory: Optional[sessionmaker] = None
+    _owns_engine: bool = False
+
+    _engine_registry: ClassVar[dict[tuple, Engine]] = {}
     _registry_lock: ClassVar[threading.Lock] = threading.Lock()
+    _active_connections: ClassVar[int] = 0
 
-    @model_validator(mode="after")
-    def validate_and_initialize(self):
-        """
-        Validate connection parameters, initialize the engine, and build the dynamic model if necessary.
-        """
-        if not self.logger:
-            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-
-        if not self.connection_url:
-            raise ValueError("`connection_url` must be provided.")
-
-        # Validate `connection_url`
-        if self.engine is not None:
-            engine_url = str(self.engine.url)
-            if engine_url != self.connection_url:
-                raise ValueError(f"Engine URL '{engine_url}' does not match the provided connection URL '{self.connection_url}'.")
-        else:
-            # Generate a unique key for the engine registry based on the connection URL
-            engine_key = (
-                self.connection_url,
-                self.pool_size,
-                self.max_overflow,
-                self.pool_timeout,
-                self.pool_recycle
-            )
-            with self.__class__._registry_lock:
-                if engine_key in self.__class__._engine_registry:
-                    # Reuse the existing engine
-                    self.logger.info(f"Reusing existing engine for connection URL: {self.connection_url}")
-                    self.engine = self.__class__._engine_registry[engine_key]
-                else:
-                    # Initialize the engine
-                    self.logger.info(f"Creating new engine for connection URL: {self.connection_url}")
-                    self.engine = create_engine(self.connection_url,
-                                                pool_size=self.pool_size,
-                                                max_overflow=self.max_overflow,
-                                                pool_timeout=self.pool_timeout,
-                                                pool_recycle=self.pool_recycle)
-                    self.__class__._engine_registry[engine_key] = self.engine
-
-        # Validate the connection
-        self.validate_connection()
-        if not self.table:
-            raise ValueError("`table_name` must be provided to build the model.")
-        try:
-            self.model = SqlAlchemyModelBuilder(self.engine, self.table).build_model()
-        except Exception as e:
-            raise ValueError(f"Failed to build model for table '{self.table}': {e}")
+    class Config:
+        arbitrary_types_allowed = True
+        underscore_attrs_are_private = True
+
+    @field_validator("pool_size", "max_overflow", "pool_timeout", "pool_recycle")
+    @classmethod
+    def _validate_pool_params(cls, v: int) -> int:
+        if v < 0:
+            raise ValueError("Pool parameters must be non-negative")
+        return v
 
+    @model_validator(mode="after")
+    def _init_all(self) -> SqlAlchemyConnectionConfig:
+        self._init_logger()
+        self._init_engine()
+        self._validate_conn()
+        self._build_model()
+        self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=False)
         return self
 
-    def validate_connection(self):
-        """
-        Test the database connection by executing a simple query.
-        """
+    def _init_logger(self) -> None:
+        self.logger = self.logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+    def _engine_key(self) -> tuple:
+        parsed = sqlalchemy_url.make_url(self.connection_url)
+        query = {k: v for k, v in parsed.query.items() if not k.startswith("pool_")}
+        normalized = parsed.set(query=query)
+        key = [str(normalized)]
+        if self.poolclass not in (NullPool, StaticPool):
+            key += [self.pool_size, self.max_overflow, self.pool_timeout, self.pool_recycle, self.pool_pre_ping, self.table]
+        return tuple(key)
+
+    def _init_engine(self) -> None:
+        key = self._engine_key()
+        with self._registry_lock:
+            existing = self._engine_registry.get(key)
+            if existing:
+                self.engine = existing
+                self._owns_engine = False
+                self.logger.debug(f"Reusing engine {key}")
+            else:
+                self.logger.debug(f"Creating engine {key}")
+                self.engine = create_engine(
+                    self.connection_url,
+                    pool_size=self.pool_size,
+                    max_overflow=self.max_overflow,
+                    pool_timeout=self.pool_timeout,
+                    pool_recycle=self.pool_recycle,
+                    pool_pre_ping=self.pool_pre_ping,
+                    poolclass=self.poolclass,
+                )
+                self._attach_events()
+                self._engine_registry[key] = self.engine
+                self._owns_engine = True
+
+    def _attach_events(self) -> None:
+        event.listen(self.engine, "checkout", self._on_checkout)
+        event.listen(self.engine, "checkin", self._on_checkin)
+
+    def _on_checkout(self, *args) -> None:
+        with self._registry_lock:
+            type(self)._active_connections += 1
+        self.logger.debug(f"Checked out, active: {self.active_connections}")
+
+    def _on_checkin(self, *args) -> None:
+        with self._registry_lock:
+            type(self)._active_connections = max(type(self)._active_connections - 1, 0)
+        self.logger.debug(f"Checked in, active: {self.active_connections}")
+
+    @property
+    def active_connections(self) -> int:
+        return type(self)._active_connections
+
+    def _validate_conn(self) -> None:
         try:
-            with self.engine.connect() as connection:
-                connection.execute(text("SELECT 1"))
+            with self.managed_connection() as conn:
+                conn.execute(text("SELECT 1"))
+            self.logger.debug("Connection OK")
         except OperationalError as e:
-            raise ValueError(f"Failed to connect to the database: {e}")
+            self.logger.error(f"Connection failed: {e}")
+            raise ValidationError(f"DB connection failed: {e}")
 
-    @classmethod
-    def clear_engine_registry(cls):
-        """Clear the global engine registry (useful for testing)."""
-        with cls._registry_lock:
-            cls._engine_registry.clear()
+    @contextmanager
+    def managed_connection(self) -> Generator[Any, None, Any]:
+        conn = self.engine.connect()
+        try:
+            yield conn
+        finally:
+            conn.close()
+
+    def get_session(self) -> Session:
+        if not self.session_factory:
+            raise RuntimeError("Session factory not initialized")
+        return self.session_factory()
+
+    def _build_model(self) -> None:
+        """Dynamically build and assign the ORM model if table is set"""
+        if not self.table or not self.engine:
+            return
+        try:
+            builder = SqlAlchemyModelBuilder(self.engine, self.table)
+            self.model = builder.build_model()
+            self.logger.debug(f"Model built for table: {self.table}")
+        except Exception as e:
+            self.logger.error(f"Model build failed: {e}")
+            raise ValidationError(f"Model construction error: {e}") from e
+
+    def dispose_idle_connections(self) -> int:
+        key = self._engine_key()
+        with self._registry_lock:
+            if self._engine_registry.get(key) is not self.engine:
+                self.logger.warning("Engine changed")
+                return 0
+            pool = self.engine.pool
+            if isinstance(pool, QueuePool):
+                count = pool.checkedin()
+                pool.dispose()
+                self.logger.debug(f"Disposed {count}")
+                return count
+            self.logger.warning(f"No idle dispose for {type(pool).__name__}")
+            return 0
+
+    def terminate_idle_connections(self, idle_seconds: int = 300) -> int:
+        terminated = 0
+        dialect = self.engine.dialect.name
+        with self.managed_connection() as conn:
+            if dialect == 'postgresql':
+                res = conn.execute(text(
+                    f"SELECT pg_terminate_backend(pid) FROM pg_stat_activity "
+                    f"WHERE state='idle' AND (now() - query_start) > interval '{idle_seconds} seconds' "
+                    f"AND pid<>pg_backend_pid()"
+                ))
+                terminated = res.rowcount
+            elif dialect == 'mysql':
+                for row in conn.execute(text("SHOW PROCESSLIST")):
+                    if row.Command == 'Sleep' and row.Time > idle_seconds:
+                        conn.execute(text(f"KILL {row.Id}"))
+                        terminated += 1
+            else:
+                self.logger.warning(f"Idle termination not supported: {dialect}")
+        self.logger.debug(f"Terminated {terminated}")
+        return terminated
+
+    def close(self) -> None:
+        with self._registry_lock:
+            key = self._engine_key()
+            if not self._owns_engine:
+                self.logger.warning("Not owner, skipping close")
+                return
+            if self._engine_registry.get(key) != self.engine:
+                self.logger.warning("Engine not in registry")
+                return
+            self.engine.dispose()
+            del self._engine_registry[key]
+            type(self)._active_connections = 0
+            self.logger.debug(f"Engine closed {key}")
@@ -133,9 +133,13 @@ class SqlAlchemyLoadFromDb:
                 dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
                 return dask_df
+
             return self.df
+        except RuntimeError as e:
+            self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return dask_df
         except Exception as e:
-            self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+            self.logger.info(f"Exception {e}:Failed to load data into Dask DataFrame.")
             dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
             return dask_df
@@ -12,11 +12,13 @@ from .parquet_saver import ParquetSaver
 from .clickhouse_writer import ClickHouseWriter
 from .airflow_manager import AirflowDAGManager
 from .credentials import *
+from .update_planner import UpdatePlanner
 from .data_wrapper import DataWrapper
 from .storage_config import StorageConfig
 from .data_from_http_source import DataFromHttpSource
 from .webdav_client import WebDAVClient
 
+
 __all__ = [
     "Logger",
     "ConfigManager",
@@ -2,7 +2,7 @@ import datetime
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Type, Any, Dict, Optional, Union, List, Tuple
-from threading import Lock
+import threading
 import fsspec
 import pandas as pd
 from IPython.display import display
@@ -11,6 +11,7 @@ from tqdm import tqdm
 from .log_utils import Logger
 from .date_utils import FileAgeChecker
 from .parquet_saver import ParquetSaver
+from .update_planner import UpdatePlanner
 
 
 class DataWrapper:
@@ -48,7 +49,7 @@ class DataWrapper:
                  timeout: float = 60,
                  reference_date: datetime.date = None,
                  custom_priority_map: Dict[str, int] = None,
-                 max_threads: int = 10):
+                 max_threads: int = 3):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self._ensure_forward_slash(data_path)
@@ -75,10 +76,26 @@ class DataWrapper:
 
         self.start_date = self._convert_to_date(start_date)
         self.end_date = self._convert_to_date(end_date)
-        self._lock = Lock()
+        self._lock = threading.Lock()
         self.processed_dates = []
         self.age_checker = FileAgeChecker(logger=self.logger)
-        self.data_class_instance = self.dataclass(**self.class_params) or None
+
+        self.update_planner_params = {
+            "data_path": self.data_path,
+            "filename": self.parquet_filename,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "reverse_order": self.reverse_order,
+            "overwrite": self.overwrite,
+            "ignore_missing": self.ignore_missing,
+            "history_days_threshold": history_days_threshold,
+            "max_age_minutes": max_age_minutes,
+            "show_progress": self.show_progress,
+            "description": f"{self.dataclass.__name__}"
+        }
+        self.update_plan = UpdatePlanner(**self.update_planner_params).generate_plan(self.start_date, self.end_date)
+
 
     def _init_filesystem(self) -> fsspec.AbstractFileSystem:
         with self._lock:
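The update plan is now computed once at construction time by the new UpdatePlanner and stored as `self.update_plan`; `process()` in the next hunk consumes that precomputed plan instead of rebuilding it per run. A rough sketch of the equivalent standalone call, using the same keyword names as `update_planner_params` above (the concrete values and the `fs`/`logger` objects are placeholders):

    # Illustrative only; values are placeholders, keyword names mirror update_planner_params.
    import datetime

    planner = UpdatePlanner(
        data_path="data/orders/",
        filename="orders.parquet",
        fs=fs,                          # an fsspec filesystem instance
        debug=False,
        logger=logger,
        reverse_order=False,
        overwrite=False,
        ignore_missing=True,
        history_days_threshold=30,
        max_age_minutes=1440,
        show_progress=False,
        description="MyDataCls",
    )
    plan = planner.generate_plan(datetime.date(2025, 1, 1), datetime.date(2025, 1, 31))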
@@ -115,13 +132,13 @@ class DataWrapper:
 
     def process(self, max_retries: int = 3):
         """Process updates with priority-based execution and retries"""
-        update_plan = self.generate_update_plan()
-
+        #update_plan = self.generate_update_plan()
+        update_plan = self.update_plan
         if update_plan.empty:
             self.logger.info("No updates required")
             return
         # Filter for required updates first
-        update_plan = update_plan[update_plan["update_required"] == True]
+        #update_plan = update_plan[update_plan["update_required"] == True]
 
         if self.show_progress:
             #display(self._enhanced_display_table(update_plan))
@@ -140,9 +157,9 @@ class DataWrapper:
             return
 
         desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
-        self.logger.info(f"Starting {desc.lower()}")
+        self.logger.debug(f"Starting {desc.lower()}")
         max_threads = min(len(dates), self.max_threads)
-        self.logger.info(f"DataWrapper Max threads set at: {max_threads}")
+        self.logger.debug(f"DataWrapper Max threads set at: {max_threads}")
         with ThreadPoolExecutor(max_workers=max_threads) as executor:
             futures = {
                 executor.submit(self._process_date_with_retry, date, max_retries): date
@@ -166,83 +183,6 @@ class DataWrapper:
             f"Unknown Priority {priority}"
         )
 
-    def _enhanced_display_table(self, df: pd.DataFrame) -> pd.DataFrame.style:
-        """Format the update plan table for better readability"""
-        return df.style \
-            .bar(subset=["file_age_minutes"], color="#5fba7d") \
-            .background_gradient(subset=["update_priority"], cmap="YlOrBr") \
-            .set_caption(f"Update Plan: {self.dataclass.__name__}")
-
-    def generate_update_plan(self) -> pd.DataFrame:
-        """Generate update plan with parallel file status checks"""
-        dates = self.generate_date_range()
-        history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
-        rows = []
-
-        with ThreadPoolExecutor() as executor:
-            future_to_date = {
-                executor.submit(self._get_file_status, date): date
-                for date in dates
-            }
-
-            for future in tqdm(as_completed(future_to_date),
-                               total=len(future_to_date),
-                               desc=f"Analyzing files for {self.dataclass.__name__} ",
-                               disable=not self.show_progress):
-                current_date = future_to_date[future]
-                file_exists, file_age = future.result()
-                rows.append(self._create_plan_row(
-                    current_date,
-                    history_start,
-                    file_exists,
-                    file_age
-                ))
-
-        return pd.DataFrame(rows).sort_values("update_priority")
-
-    def _get_file_status(self, date: datetime.date) -> Tuple[bool, float]:
-        """Get file existence and age with error handling"""
-        path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/{self.parquet_filename}"
-        try:
-            exists = self.fs.exists(path)
-            age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
-            return exists, age
-        except Exception as e:
-            self.logger.warning(f"Error checking {path}: {str(e)}")
-            return False, None
-
-    def _create_plan_row(self,
-                         date: datetime.date,
-                         history_start: datetime.date,
-                         file_exists: bool,
-                         file_age: float) -> dict:
-        """Create a row for the update plan DataFrame"""
-        within_history = history_start <= date <= self.reference_date
-        category, update_required = "file_is_recent", False
-
-        if self.overwrite:
-            category, update_required = "overwrite", True
-        elif within_history:
-            if not file_exists:
-                category, update_required = "missing_in_history", True
-            elif file_age > self.max_age_minutes:
-                category, update_required = "existing_but_stale", True
-        elif not file_exists and not self.ignore_missing:
-            category, update_required = "missing_outside_history", True
-
-        return {
-            "date": date,
-            "file_exists": file_exists,
-            "file_age_minutes": file_age,
-            "age_threshold": self.max_age_minutes,
-            "within_history": within_history,
-            "ignore_missing": self.ignore_missing,
-            "update_category": category,
-            "update_priority": self.priority_map[category],
-            "update_required": update_required,
-            "class": self.dataclass.__name__
-        }
-
     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
         """Process a date with retry logic"""
         for attempt in range(1, max_retries + 1):
@@ -267,13 +207,14 @@ class DataWrapper:
         self.logger.debug(f"Class Params: {self.class_params}")
         self.logger.debug(f"Load Params: {self.load_params}")
 
-        #data = self.dataclass(**self.class_params)
-        df = self.data_class_instance.load_period(
-            dt_field=self.date_field,
-            start=date,
-            end=date,
-            **self.load_params
-        )
+        df = pd.DataFrame()
+        with self.dataclass(**self.class_params) as data:
+            df = data.load_period(
+                dt_field=self.date_field,
+                start=date,
+                end=date,
+                **self.load_params
+            )
 
         if len(df.index)==0:
             self.logger.warning(f"No data found for {date}")
@@ -1,6 +1,7 @@
 import base64
 import hashlib
 import logging
+import threading
 import warnings
 from typing import Optional
 
@@ -27,6 +28,7 @@ class ParquetSaver:
         self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.fs = fs
         self.protocol = self.parquet_storage_path.split(":")[0]
+        self._lock = threading.Lock()
 
     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
         """
@@ -34,18 +36,19 @@ class ParquetSaver:
         :param parquet_filename: Filename for the Parquet file.
         :param clear_existing: Whether to clear existing files in the target directory.
         """
-        full_path = self._construct_full_path(parquet_filename)
-        self.logger.info(f"Save method for :{full_path}")
-        # Ensure directory exists and clear if necessary
-        self._ensure_directory_exists(full_path, clear_existing=clear_existing)
-
-        # Define schema and save DataFrame to Parquet
-        schema = self._define_schema()
-        self._convert_dtypes(schema)
-        self._save_dataframe_to_parquet(full_path, schema)
-        # Close the filesystem if the close method exists
-        if hasattr(self.fs, 'close') and callable(getattr(self.fs, 'close', None)):
-            self.fs.close()
+        with self._lock:
+            full_path = self._construct_full_path(parquet_filename)
+            self.logger.info(f"Save method for :{full_path}")
+            # Ensure directory exists and clear if necessary
+            self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+
+            # Define schema and save DataFrame to Parquet
+            schema = self._define_schema()
+            self._convert_dtypes(schema)
+            self._save_dataframe_to_parquet(full_path, schema)
+            # Close the filesystem if the close method exists
+            if hasattr(self.fs, 'close') and callable(getattr(self.fs, 'close', None)):
+                self.fs.close()
 
     def _define_schema(self) -> pa.Schema:
         """Define a PyArrow schema dynamically based on df_result column types."""