sibi-dst 0.3.63__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,143 +3,72 @@ import pandas as pd
3
3
 
4
4
  from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
5
5
  from sibi_dst.utils import Logger
6
- from ._io_dask import SQLAlchemyDask
7
6
  from ._db_connection import SqlAlchemyConnectionConfig
7
+ from ._io_dask import SQLAlchemyDask
8
8
 
9
9
 
10
10
  class SqlAlchemyLoadFromDb:
11
11
  """
12
- The SqlAlchemyLoadFromDb class provides functionality to load data from a
13
- database using SQLAlchemy into a Dask DataFrame. It is capable of handling
14
- large datasets efficiently by utilizing the Dask framework for parallel
15
- computations.
16
-
17
- This class is initialized with a database connection configuration, query
18
- configuration, optional parameters, and a logger. It can execute a query
19
- using the specified configurations and read the results into a Dask
20
- DataFrame. This is useful for processing and analyzing large-scale data.
21
-
22
- :ivar df: Dask DataFrame to store the loaded data.
23
- :type df: dd.DataFrame
24
- :ivar db_connection: Database connection configuration object, containing details
25
- such as the table, model, and engine to be used for the query.
26
- :type db_connection: SqlAlchemyConnectionConfig
27
- :ivar table_name: Name of the database table being queried.
28
- :type table_name: str
29
- :ivar model: SQLAlchemy model associated with the database connection.
30
- :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
31
- :ivar engine: SQLAlchemy engine used for executing queries.
32
- :type engine: sqlalchemy.engine.base.Engine
33
- :ivar logger: Logger instance for logging debug and error information.
34
- :type logger: Logger
35
- :ivar query_config: Query configuration, including query-related details such
36
- as the SQL query or query settings.
37
- :type query_config: QueryConfig
38
- :ivar params_config: Parameters configuration, including filter parameters for
39
- the query.
40
- :type params_config: ParamsConfig
41
- :ivar debug: Debug flag indicating whether debug mode is enabled.
42
- :type debug: bool
43
- :ivar chunk_size: Size of data chunks to process at a time.
44
- :type chunk_size: int
12
+ Orchestrates loading data from a database using SQLAlchemy into a Dask
13
+ DataFrame by configuring and delegating to the SQLAlchemyDask loader.
45
14
  """
46
- df: dd.DataFrame = None
47
15
 
48
16
  def __init__(
49
17
  self,
50
- plugin_sqlalchemy: SqlAlchemyConnectionConfig, # Expected to be an instance of SqlAlchemyConnection
18
+ plugin_sqlalchemy: SqlAlchemyConnectionConfig,
51
19
  plugin_query: QueryConfig = None,
52
20
  plugin_params: ParamsConfig = None,
53
21
  logger: Logger = None,
54
22
  **kwargs,
55
23
  ):
56
24
  """
57
- Initializes an instance of the class, setting up a database connection,
58
- query configuration, parameter configuration, and other optional settings
59
- like debugging and logging. The class aims to manage the integration and
60
- interaction with SQLAlchemy-based database operations.
61
-
62
- :param plugin_sqlalchemy:
63
- The SQLAlchemy connection configuration object, which provides
64
- the connection details like engine, table name, and model
65
- associated with the database operations.
66
- :param plugin_query:
67
- The query configuration object, used to define specific query
68
- options or rules. Defaults to None.
69
- :param plugin_params:
70
- The parameters configuration object, used for any additional
71
- parameterized settings or configurations. Defaults to None.
72
- :param logger:
73
- Optional logger instance for logging purposes. If not provided,
74
- a default logger is instantiated using the standard logging system.
75
- :param kwargs:
76
- Optional additional keyword arguments for customization. Can
77
- include optional settings like `debug` mode or `chunk_size`
78
- for batch operations.
25
+ Initializes the loader with all necessary configurations.
26
+
27
+ Args:
28
+ plugin_sqlalchemy: The database connection configuration object.
29
+ plugin_query: The query configuration object.
30
+ plugin_params: The parameters and filters configuration object.
31
+ logger: An optional logger instance.
32
+ **kwargs: Must contain 'index_column' for Dask partitioning.
79
33
  """
80
34
  self.db_connection = plugin_sqlalchemy
81
- self.table_name = self.db_connection.table
82
35
  self.model = self.db_connection.model
83
36
  self.engine = self.db_connection.engine
84
37
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
85
38
  self.query_config = plugin_query
86
39
  self.params_config = plugin_params
87
- self.debug = kwargs.pop("debug", False)
88
- self.chunk_size = kwargs.pop("chunk_size", 1000)
40
+ self.debug = kwargs.get("debug", False)
41
+ self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
89
42
 
90
43
  def build_and_load(self) -> dd.DataFrame:
91
44
  """
92
- Builds and returns the resulting dataframe after calling the internal
93
- build and load function. This method triggers the `_build_and_load`
94
- function to process and prepare the data before returning it as
95
- a dask dataframe.
45
+ Builds and loads a Dask DataFrame from a SQLAlchemy source.
96
46
 
97
- :raises RuntimeError: If any error occurs during the build or load process.
47
+ This method is stateless and returns the DataFrame directly.
98
48
 
99
- :return: The processed data in a dask dataframe.
100
- :rtype: dd.DataFrame
101
- """
102
- self._build_and_load()
103
- return self.df
104
-
105
- def _build_and_load(self) -> dd.DataFrame:
106
- """
107
- Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
108
-
109
- This method initializes a SQLAlchemyDask object with the provided model,
110
- filters, engine URL, logger, chunk size, and debug configuration.
111
- It attempts to load the data using the ``read_frame`` method of
112
- SQLAlchemyDask. If the data cannot be loaded or the query returns
113
- no rows, it creates and returns an empty Dask DataFrame.
114
-
115
- :raises Exception: On failure to load data or to create a DataFrame.
116
-
117
- :return: A Dask DataFrame object containing the queried data or an
118
- empty DataFrame if the query returns no results or fails.
119
- :rtype: dask.dataframe.DataFrame
49
+ Returns:
50
+ A Dask DataFrame containing the queried data or an empty,
51
+ correctly structured DataFrame if the query fails or returns no results.
120
52
  """
121
53
  try:
122
- self.df = SQLAlchemyDask(
54
+ # Instantiate and use the low-level Dask loader
55
+ sqlalchemy_dask_loader=SQLAlchemyDask(
123
56
  model=self.model,
124
- filters=self.params_config.filters,
125
- engine_url=self.engine.url,
126
- logger=self.logger,
57
+ filters=self.params_config.filters if self.params_config else {},
58
+ engine=self.engine,
127
59
  chunk_size=self.chunk_size,
60
+ logger=self.logger,
128
61
  debug=self.debug
129
- ).read_frame()
130
-
131
- if self.df is None or len(self.df.head().index) == 0:
132
- self.logger.debug("Query returned no results.")
133
- dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
62
+ )
63
+ # Create the lazy DataFrame
64
+ dask_df = sqlalchemy_dask_loader.read_frame()
65
+ return dask_df
134
66
 
135
- return dask_df
136
67
 
137
- return self.df
138
- except RuntimeError as e:
139
- self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
140
- dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
141
- return dask_df
142
68
  except Exception as e:
143
- self.logger.info(f"Exception {e}:Failed to load data into Dask DataFrame.")
144
- dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
145
- return dask_df
69
+ self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
70
+ # Return an empty dataframe with the correct schema on failure
71
+ columns = [c.name for c in self.model.__table__.columns]
72
+ return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
73
+
74
+
@@ -1,193 +1,206 @@
1
1
  import re
2
+ import keyword
3
+ import threading
4
+ from sqlalchemy import MetaData, Engine
5
+ from sqlalchemy.orm import DeclarativeBase
2
6
 
3
- from sqlalchemy import MetaData, Table
4
- from sqlalchemy.orm import declarative_base, relationship
5
7
 
6
- # Base class for dynamically created models
7
- Base = declarative_base()
8
+ class Base(DeclarativeBase):
9
+ """Shared declarative base for all ORM models."""
10
+ pass
8
11
 
9
- apps_label = "datacubes"
12
+
13
+ apps_label = "datacubes.models"
10
14
 
11
15
 
12
16
  class SqlAlchemyModelBuilder:
13
17
  """
14
- Provides functionality for building SQLAlchemy ORM models dynamically from
15
- reflected database tables. This class is intended for use with a SQLAlchemy
16
- engine and metadata to automatically generate ORM models for specified
17
- database tables.
18
-
19
- The primary purpose of this class is to simplify the process of creating
20
- SQLAlchemy ORM models by reflecting tables from a connected database,
21
- dynamically generating model classes, and handling relationships between
22
- tables.
23
-
24
- :ivar engine: SQLAlchemy engine connected to the database.
25
- :type engine: Engine
26
- :ivar table_name: Name of the table for which the model is generated.
27
- :type table_name: str
28
- :ivar metadata: SQLAlchemy MetaData instance for reflecting tables.
29
- :type metadata: MetaData
30
- :ivar table: Reflected SQLAlchemy Table object for the specified table name.
31
- :type table: Optional[Table]
32
- :ivar class_name: Dynamically normalized class name derived from table_name.
33
- :type class_name: str
18
+ Builds a single SQLAlchemy ORM model from a specific database table.
19
+ This class is thread-safe and caches reflected table metadata to
20
+ improve performance across multiple instantiations.
34
21
  """
35
- _model_cache = {} # Local cache for model classes
22
+ _lock = threading.Lock()
23
+ _metadata_cache: dict[str, MetaData] = {}
36
24
 
37
- def __init__(self, engine, table_name):
25
+ def __init__(self, engine: Engine, table_name: str):
38
26
  """
39
- Initialize the model builder with a database engine and specific table.
27
+ Initializes the model builder for a specific table.
40
28
 
41
29
  Args:
42
- engine: SQLAlchemy engine connected to the database.
43
- table_name (str): Name of the table to generate the model for.
30
+ engine: The SQLAlchemy engine connected to the database.
31
+ table_name: The name of the table to generate the model for.
44
32
  """
45
33
  self.engine = engine
46
34
  self.table_name = table_name
47
- self.metadata = MetaData()
48
- self.table = None # Placeholder for the specific table
49
- self.class_name = self.normalize_class_name(self.table_name)
35
+ self.class_name = self._normalize_class_name(self.table_name)
50
36
 
51
- def build_model(self) -> type:
52
- """
53
- Builds and returns a database model class corresponding to the specified table name.
54
- The method checks if the model is already registered in the ORM's registry. If not,
55
- it reflects the database schema of the specified table and dynamically creates the
56
- model class.
57
-
58
- :raises ValueError: If the specified table does not exist in the database.
59
- :return: A database model class corresponding to the specified table name.
60
- :rtype: type
61
- """
62
- # Check if the model is already registered
63
- model = Base.registry._class_registry.get(self.class_name)
64
- if model:
65
- return model
37
+ engine_key = str(engine.url)
66
38
 
67
- self.metadata.reflect(only=[self.table_name], bind=self.engine)
68
- self.table = self.metadata.tables.get(self.table_name)
69
- if self.table is None:
70
- raise ValueError(f"Table '{self.table_name}' does not exist in the database.")
39
+ # ✅ REFACTOR: Acquire lock to make cache access and creation atomic,
40
+ # preventing a race condition between multiple threads.
41
+ with self._lock:
42
+ if engine_key not in self._metadata_cache:
43
+ self._metadata_cache[engine_key] = MetaData()
44
+ self.metadata = self._metadata_cache[engine_key]
71
45
 
72
- model = self.create_model()
73
- return model
74
-
75
- def create_model(self) -> type:
46
+ def build_model(self) -> type:
76
47
  """
77
- Generates a SQLAlchemy model class dynamically based on the specified table and
78
- its columns. The method extracts column information, defines the necessary
79
- attributes, and creates the model class if it doesn't already exist in the
80
- SQLAlchemy base registry.
48
+ Builds and returns a database model class for the specified table.
49
+ This process is atomic and thread-safe.
81
50
 
82
- :raises KeyError: If the table or table name does not exist in the provided
83
- schema.
84
- :raises Exception: If the model creation fails for any reason.
85
-
86
- :return: The dynamically created or fetched model class.
87
- :rtype: type
51
+ Raises:
52
+ ValueError: If the specified table does not exist in the database.
53
+ Returns:
54
+ The dynamically created ORM model class.
88
55
  """
89
- # Normalize the class name from the table name
90
- columns = self.get_columns(self.table)
91
-
92
- # Define attributes for the model class
93
- attrs = {
94
- "__tablename__": self.table_name,
95
- "__table__": self.table,
96
- "__module__": f"{apps_label}.models",
97
- "__mapper_args__": {"eager_defaults": True},
98
- }
99
-
100
- # Add columns and relationships to the model
101
- attrs.update(columns)
102
- #self.add_relationships(attrs, self.table)
103
- model = Base.registry._class_registry.get(self.class_name)
104
- if not model:
56
+ with self._lock:
57
+ # REFACTOR: Add a comment acknowledging the risk of using an
58
+ # internal API. This is a maintenance warning for future developers.
59
+ # NOTE: Using a private SQLAlchemy API. This is a performance
60
+ # optimization but may break in future versions of the library.
61
+ registered_model = Base.registry._class_registry.get(self.class_name)
62
+ if registered_model:
63
+ return registered_model
64
+
65
+ # Check if the table's schema is in our metadata cache
66
+ table = self.metadata.tables.get(self.table_name)
67
+
68
+ # If not cached, reflect it from the database
69
+ if table is None:
70
+ self.metadata.reflect(bind=self.engine, only=[self.table_name])
71
+ table = self.metadata.tables.get(self.table_name)
72
+
73
+ if table is None:
74
+ raise ValueError(
75
+ f"Table '{self.table_name}' does not exist in the database."
76
+ )
77
+
78
+ # Create the model class dynamically.
79
+ attrs = {
80
+ "__tablename__": table.name,
81
+ "__table__": table,
82
+ "__module__": apps_label,
83
+ }
105
84
  model = type(self.class_name, (Base,), attrs)
106
- # Add the class to Base.registry so it is registered
107
- Base.registry._class_registry[self.class_name] = model
108
- return model
109
-
110
- def get_columns(self, table: Table):
111
- """
112
- Extracts and returns a dictionary of column names and their corresponding column
113
- objects from a given table, excluding reserved names. Reserved names are used
114
- internally and should not overlap with column names in the provided table. The
115
- method ensures sanitized column names through normalization and filters out any
116
- column matching reserved keywords.
117
-
118
- :param table: The table object from which columns are to be extracted.
119
- :type table: Table
120
- :return: A dictionary containing the sanitized column names as keys and their
121
- corresponding column objects as values, excluding reserved names.
122
- :rtype: dict
123
- """
124
- columns = {}
125
- reserved_names = ["metadata", "class_", "table"]
126
-
127
- for column in table.columns:
128
- column_name = self.normalize_column_name(column.name)
129
- if column_name not in reserved_names:
130
- columns[column_name] = column
131
- return columns
132
-
133
- def add_relationships(self, attrs, table: Table):
134
- """
135
- Adds relationships to the provided attributes dictionary for a given database table.
136
-
137
- This method iterates through the foreign keys of the provided table, constructs
138
- relationship attributes, and updates the attributes dictionary with relationships
139
- that connect the current table to related tables.
140
-
141
- :param attrs: Dictionary of attributes to which relationships will be added.
142
- The dictionary will be updated with new relationship mappings.
143
- :type attrs: dict
144
- :param table: A database table object containing foreign key relationships.
145
- The method will use this table to establish relationships.
146
- :return: None
147
- """
148
- for fk in table.foreign_keys:
149
- related_table_name = fk.column.table.name
150
- related_class_name = self.normalize_class_name(related_table_name)
151
- relationship_name = self.normalize_column_name(related_table_name)
152
- attrs[relationship_name] = relationship(related_class_name, back_populates=None)
153
85
 
86
+ return model
154
87
 
155
88
  @staticmethod
156
- def normalize_class_name(table_name: str) -> str:
157
- """
158
- Generate a normalized class name from a given table name by capitalizing
159
- each word separated by underscores and concatenating them.
160
-
161
- This static method takes a string representation of a table name, where
162
- words are separated by underscores, and converts it into a camel case
163
- class name. It processes the string by capitalizing the first letter of
164
- each word and removing the underscores. The normalized class name
165
- returned can be used programmatically for various purposes, such as
166
- class generation or naming conventions.
167
-
168
- :param table_name: The table name to normalize, with words separated by
169
- underscores. E.g., 'sample_table' becomes 'SampleTable'.
170
- :type table_name: str
171
- :return: A normalized class name in camel case format.
172
- :rtype: str
173
- """
89
+ def _normalize_class_name(table_name: str) -> str:
90
+ """Converts a snake_case table_name to a CamelCase class name."""
174
91
  return "".join(word.capitalize() for word in table_name.split("_"))
175
92
 
176
93
  @staticmethod
177
- def normalize_column_name(column_name: str) -> str:
94
+ def _normalize_column_name(column_name: str) -> str:
178
95
  """
179
- Normalize a column name by replacing any non-word characters or leading numbers
180
- with underscores, while ensuring it does not conflict with reserved keywords
181
- such as 'class', 'def', 'return', etc. If the normalized name conflicts with
182
- a Python reserved keyword, "_field" is appended to it.
183
-
184
- :param column_name: The original name of the column to be normalized.
185
- :type column_name: str
186
- :return: A normalized column name that is safe and compatible for usage
187
- in various contexts such as database columns or Python code.
188
- :rtype: str
96
+ Sanitizes a column name to be a valid Python identifier.
97
+ (Kept for utility, though not used in the final model creation).
189
98
  """
190
- column_name = re.sub(r"\W|^(?=\d)", "_", column_name)
191
- if column_name in {"class", "def", "return", "yield", "global"}:
192
- column_name += "_field"
193
- return column_name
99
+ sane_name = re.sub(r"\W", "_", column_name)
100
+ sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
101
+
102
+ if keyword.iskeyword(sane_name):
103
+ return f"{sane_name}_field"
104
+ return sane_name
105
+
106
+ # import re
107
+ # import keyword
108
+ # import threading
109
+ # from sqlalchemy import MetaData, Engine
110
+ # from sqlalchemy.orm import DeclarativeBase
111
+ #
112
+ #
113
+ #
114
+ # class Base(DeclarativeBase):
115
+ # """shared declarative base for all ORM models."""
116
+ # pass
117
+ #
118
+ #
119
+ # apps_label = "datacubes.models"
120
+ #
121
+ #
122
+ # class SqlAlchemyModelBuilder:
123
+ # """
124
+ # Builds a single SQLAlchemy ORM model from a specific database table.
125
+ # This class is thread-safe and caches reflected table metadata to
126
+ # improve performance across multiple instantiations.
127
+ # """
128
+ # _lock = threading.Lock()
129
+ # _metadata_cache: dict[str, MetaData] = {}
130
+ #
131
+ # def __init__(self, engine: Engine, table_name: str):
132
+ # """
133
+ # Initializes the model builder for a specific table.
134
+ #
135
+ # Args:
136
+ # engine: The SQLAlchemy engine connected to the database.
137
+ # table_name: The name of the table to generate the model for.
138
+ # """
139
+ # self.engine = engine
140
+ # self.table_name = table_name
141
+ # self.class_name = self._normalize_class_name(self.table_name)
142
+ #
143
+ # # Use or create a cached MetaData object for this engine to avoid
144
+ # # re-reading the schema for tables that are already known.
145
+ # engine_key = str(engine.url)
146
+ # if engine_key not in self._metadata_cache:
147
+ # self._metadata_cache[engine_key] = MetaData()
148
+ # self.metadata = self._metadata_cache[engine_key]
149
+ #
150
+ # def build_model(self) -> type:
151
+ # """
152
+ # Builds and returns a database model class for the specified table.
153
+ # This process is atomic and thread-safe.
154
+ #
155
+ # Raises:
156
+ # ValueError: If the specified table does not exist in the database.
157
+ # Returns:
158
+ # The dynamically created ORM model class.
159
+ # """
160
+ # with self._lock:
161
+ # # First, check if the model class is already registered in SQLAlchemy
162
+ # registered_model = Base.registry._class_registry.get(self.class_name)
163
+ # if registered_model:
164
+ # return registered_model
165
+ #
166
+ # # Next, check if the table's schema is in our metadata cache
167
+ # table = self.metadata.tables.get(self.table_name)
168
+ #
169
+ # # If not cached, reflect it from the database
170
+ # if table is None:
171
+ # self.metadata.reflect(bind=self.engine, only=[self.table_name])
172
+ # table = self.metadata.tables.get(self.table_name)
173
+ #
174
+ # if table is None:
175
+ # raise ValueError(
176
+ # f"Table '{self.table_name}' does not exist in the database."
177
+ # )
178
+ #
179
+ # # Create the model class dynamically.
180
+ # # No need to add columns manually; __table__ handles it.
181
+ # attrs = {
182
+ # "__tablename__": table.name,
183
+ # "__table__": table,
184
+ # "__module__": apps_label,
185
+ # }
186
+ # model = type(self.class_name, (Base,), attrs)
187
+ #
188
+ # return model
189
+ #
190
+ # @staticmethod
191
+ # def _normalize_class_name(table_name: str) -> str:
192
+ # """Converts a snake_case table_name to a CamelCase class name."""
193
+ # return "".join(word.capitalize() for word in table_name.split("_"))
194
+ #
195
+ # @staticmethod
196
+ # def _normalize_column_name(column_name: str) -> str:
197
+ # """
198
+ # Sanitizes a column name to be a valid Python identifier.
199
+ # (Kept for utility, though not used in the final model creation).
200
+ # """
201
+ # sane_name = re.sub(r"\W", "_", column_name)
202
+ # sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
203
+ #
204
+ # if keyword.iskeyword(sane_name):
205
+ # return f"{sane_name}_field"
206
+ # return sane_name
@@ -7,8 +7,8 @@ class QueryConfig(BaseModel):
7
7
  use_exclude: bool = False
8
8
  n_records: int = 100
9
9
  dt_field: Optional[str] = None
10
- use_dask: bool = False
11
- as_dask: bool = False
10
+ use_dask: bool = True
11
+ as_dask: bool = True
12
12
 
13
13
  @model_validator(mode='after')
14
14
  def check_n_records(self):
@@ -115,22 +115,26 @@ class Logger:
115
115
  """
116
116
  self.logger.setLevel(level)
117
117
 
118
- def debug(self, msg: str):
118
+ def debug(self, msg: str, *args, **kwargs):
119
119
  """Log a debug message."""
120
- self.logger.debug(msg)
120
+ self.logger.debug(msg, *args, **kwargs)
121
121
 
122
- def info(self, msg: str):
122
+ def info(self, msg: str, *args, **kwargs):
123
123
  """Log an info message."""
124
- self.logger.info(msg)
124
+ self.logger.info(msg, *args, **kwargs)
125
125
 
126
- def warning(self, msg: str):
126
+ def warning(self, msg: str, *args, **kwargs):
127
127
  """Log a warning message."""
128
- self.logger.warning(msg)
128
+ self.logger.warning(msg, *args, **kwargs)
129
129
 
130
- def error(self, msg: str):
131
- """Log an error message."""
132
- self.logger.error(msg)
130
+ def error(self, msg: str, *args, **kwargs):
131
+ """
132
+ Log an error message.
133
+
134
+ To log exception information, use the `exc_info=True` keyword argument.
135
+ """
136
+ self.logger.error(msg, *args, **kwargs)
133
137
 
134
- def critical(self, msg: str):
138
+ def critical(self, msg: str, *args, **kwargs):
135
139
  """Log a critical message."""
136
- self.logger.critical(msg)
140
+ self.logger.critical(msg, *args, **kwargs)