sibi-dst 0.3.45__py3-none-any.whl → 0.3.46__py3-none-any.whl
- sibi_dst/__init__.py +38 -0
- sibi_dst/{df_helper → v1/df_helper}/_artifact_updater_multi_wrapper.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/_df_helper.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/_parquet_artifact.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/_parquet_reader.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/django/_load_from_db.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/backends/http/_http_config.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_filter_handler.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_parquet_options.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_load_from_db.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_sql_model_builder.py +2 -1
- sibi_dst/{df_helper → v1/df_helper}/core/_filter_handler.py +1 -1
- sibi_dst/v1/osmnx_helper/__init__.py +6 -0
- sibi_dst/{tests → v1/tests}/test_data_wrapper_class.py +11 -10
- sibi_dst/{utils → v1/utils}/__init__.py +2 -0
- sibi_dst/{utils → v1/utils}/clickhouse_writer.py +1 -1
- sibi_dst/v1/utils/data_from_http_source.py +49 -0
- sibi_dst/{utils → v1/utils}/data_utils.py +5 -3
- sibi_dst/{utils → v1/utils}/data_wrapper.py +3 -1
- sibi_dst/{utils → v1/utils}/date_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/file_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/filepath_generator.py +1 -1
- sibi_dst/{utils → v1/utils}/parquet_saver.py +1 -1
- sibi_dst/v1/utils/storage_config.py +28 -0
- sibi_dst/v2/df_helper/__init__.py +7 -0
- sibi_dst/v2/df_helper/_df_helper.py +214 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +10 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +82 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +135 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +297 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +9 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +78 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +122 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +283 -0
- sibi_dst/v2/df_helper/core/__init__.py +9 -0
- sibi_dst/v2/df_helper/core/_filter_handler.py +236 -0
- sibi_dst/v2/df_helper/core/_params_config.py +139 -0
- sibi_dst/v2/df_helper/core/_query_config.py +17 -0
- sibi_dst/v2/utils/__init__.py +5 -0
- sibi_dst/v2/utils/log_utils.py +120 -0
- {sibi_dst-0.3.45.dist-info → sibi_dst-0.3.46.dist-info}/METADATA +3 -2
- sibi_dst-0.3.46.dist-info/RECORD +80 -0
- sibi_dst/osmnx_helper/__init__.py +0 -9
- sibi_dst/osmnx_helper/v2/base_osm_map.py +0 -153
- sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
- sibi_dst-0.3.45.dist-info/RECORD +0 -62
- /sibi_dst/{df_helper/backends → v1}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/df_helper/backends}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_io_dask.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_sql_model_builder.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/http/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/parquet/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_filter_handler.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_defaults.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_params_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_query_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/data_cleaner.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/__init__.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/geo_location_service.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/base_osm_map.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/calendar_html.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/router_plotter.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v2 → v1/tests}/__init__.py +0 -0
- /sibi_dst/{utils → v1/utils}/airflow_manager.py +0 -0
- /sibi_dst/{utils → v1/utils}/credentials.py +0 -0
- /sibi_dst/{utils → v1/utils}/df_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/log_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/phone_formatter.py +0 -0
- /sibi_dst/{utils → v1/utils}/storage_manager.py +0 -0
- /sibi_dst/{osmnx_helper/v2/basemaps → v2}/__init__.py +0 -0
- /sibi_dst/{tests → v2/df_helper/backends}/__init__.py +0 -0
- {sibi_dst-0.3.45.dist-info → sibi_dst-0.3.46.dist-info}/WHEEL +0 -0
sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py

```diff
@@ -0,0 +1,10 @@
+from ._db_connection import SqlAlchemyConnectionConfig
+from ._model_builder import SqlAlchemyModelBuilder
+from ._load_from_db import SqlAlchemyLoadFromDb
+
+__all__ = [
+    'SqlAlchemyConnectionConfig',
+    'SqlAlchemyModelBuilder',
+    'SqlAlchemyLoadFromDb',
+]
+
```
sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py

```diff
@@ -0,0 +1,82 @@
+from typing import Any, Optional
+
+from pydantic import BaseModel, model_validator, ConfigDict
+from sqlalchemy import create_engine, text
+from sqlalchemy.engine import Engine
+from sqlalchemy.exc import OperationalError
+
+from sibi_dst.v2.utils import Logger
+from ._model_builder import SqlAlchemyModelBuilder
+
+
+class SqlAlchemyConnectionConfig(BaseModel):
+    """
+    Configuration for establishing an SQLAlchemy database connection and dynamically building
+    an ORM model for a specific table.
+
+    Attributes:
+        connection_url (str): The URL used to connect to the database.
+        table (Optional[str]): The name of the table for which the model will be built.
+        model (Any): The dynamically built SQLAlchemy model.
+        engine (Optional[Engine]): The SQLAlchemy engine instance.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    connection_url: str
+    table: Optional[str] = None
+    model: Any = None
+    engine: Optional[Engine] = None
+    debug: bool = False
+    logger: Optional[Logger] = None
+    add_relationships: bool = False
+    export_models: bool = False
+    export_file_name: str = 'models.py'
+
+    @model_validator(mode="after")
+    def validate_and_initialize(self) -> "SqlAlchemyConnectionConfig":
+        """
+        Validate the configuration, initialize the engine, test the connection, and build the model.
+
+        Raises:
+            ValueError: If `connection_url` or `table` is missing, or if the connection or model
+                building fails.
+        """
+        self.logger = self.logger or Logger.default_logger(logger_name="sqlalchemy_connection", debug=self.debug)
+        self.logger.debug("Validating and initializing SQLAlchemy connection configuration.")
+        if not self.connection_url:
+            raise ValueError("`connection_url` must be provided.")
+
+        # Initialize the engine.
+        self.engine = create_engine(self.connection_url)
+        self.logger.debug("Engine created for the given connection URL.")
+
+        # Validate the connection.
+        self.validate_connection()
+
+        if not self.table:
+            raise ValueError("`table` must be provided to build the model.")
+
+        try:
+            builder = SqlAlchemyModelBuilder(self.engine, self.table, self.add_relationships, self.debug, self.logger)
+            self.model = builder.build_model()
+            if self.export_models:
+                builder.export_models_to_file(self.export_file_name)
+            self.logger.debug(f"Successfully built model for table: {self.table}")
+        except Exception as e:
+            raise ValueError(f"Failed to build model for table {self.table}: {e}") from e
+
+        return self
+
+    def validate_connection(self) -> None:
+        """
+        Test the database connection by executing a simple query.
+
+        Raises:
+            ValueError: If the connection cannot be established.
+        """
+        try:
+            with self.engine.connect() as connection:
+                connection.execute(text("SELECT 1"))
+            self.logger.debug("Database connection validated.")
+        except OperationalError as e:
+            raise ValueError(f"Failed to connect to the database: {e}") from e
```
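For orientation, a minimal usage sketch of the new `SqlAlchemyConnectionConfig` added above; the SQLite URL and table name are placeholders, not part of the package:

```python
# Hypothetical usage; "sqlite:///example.db" and "customers" are placeholders.
from sibi_dst.v2.df_helper.backends.sqlalchemy import SqlAlchemyConnectionConfig

config = SqlAlchemyConnectionConfig(
    connection_url="sqlite:///example.db",
    table="customers",
    debug=True,
)
# The model validator has already created the engine, verified the
# connection with SELECT 1, and built the ORM model for the table.
Customers = config.model
```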
sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py

```diff
@@ -0,0 +1,135 @@
+import itertools
+import logging
+
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy import create_engine, inspect, select
+from sqlalchemy.orm import sessionmaker
+
+from sibi_dst.v2.df_helper.core import FilterHandler
+from sibi_dst.v2.utils import Logger
+
+
+class SQLAlchemyDask:
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
+        """
+        Initialize with an SQLAlchemy ORM model and a database engine URL.
+
+        :param model: SQLAlchemy ORM model.
+        :param filters: Filters to apply on the query.
+        :param engine_url: Database connection string for the SQLAlchemy engine.
+        :param chunk_size: Number of records per chunk for Dask partitions.
+        :param logger: Logger instance for logging.
+        :param debug: Whether to print detailed logs.
+        """
+        self.query = None
+        self.model = model
+        self.filters = filters
+        self.chunk_size = chunk_size
+        self.debug = debug
+        self.engine = create_engine(engine_url)
+        self.Session = sessionmaker(bind=self.engine)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        # Use stdlib logging level constants; the `logger` argument may be None.
+        self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+
+    @staticmethod
+    def infer_dtypes_from_model(model):
+        """
+        Infer data types for the Dask DataFrame based on the SQLAlchemy ORM model columns.
+        """
+        mapper = inspect(model)
+        sqlalchemy_to_dask_dtype = {
+            'INTEGER': 'Int64',
+            'SMALLINT': 'Int64',
+            'BIGINT': 'Int64',
+            'FLOAT': 'float64',
+            'NUMERIC': 'float64',
+            'BOOLEAN': 'bool',
+            'VARCHAR': 'object',
+            'TEXT': 'object',
+            'DATE': 'datetime64[ns]',
+            'DATETIME': 'datetime64[ns]',
+            'TIME': 'object',
+            'UUID': 'object',
+        }
+
+        dtypes = {}
+        for column in mapper.columns:
+            dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+            dtypes[column.name] = dtype
+
+        return dtypes
+
+    def read_frame(self, fillna_value=None):
+        """
+        Load data from an SQLAlchemy query into a Dask DataFrame.
+
+        :param fillna_value: Value to replace NaN or NULL values with, if any.
+        :return: Dask DataFrame.
+        """
+        # Resolve the model's column order up front; the fallback empty
+        # DataFrame in the except branch reuses it.
+        ordered_columns = [column.name for column in self.model.__table__.columns]
+        with self.Session() as session:
+            try:
+                # Build the query.
+                self.query = select(self.model)
+                if self.filters:
+                    self.query = FilterHandler(
+                        backend="sqlalchemy", logger=self.logger, debug=self.debug
+                    ).apply_filters(self.query, model=self.model, filters=self.filters)
+                else:
+                    n_records = 100
+                    self.query = self.query.limit(n_records)
+                self.logger.debug(f"query: {self.query}")
+                # Infer dtypes.
+                dtypes = self.infer_dtypes_from_model(self.model)
+
+                # Execute the query and fetch results in chunks.
+                result_proxy = session.execute(self.query)
+                results = result_proxy.scalars().all()  # Fetch all rows
+                iterator = iter(results)
+
+                partitions = []
+
+                while True:
+                    chunk = list(itertools.islice(iterator, self.chunk_size))
+                    if not chunk:
+                        break
+
+                    # Convert the chunk to a Pandas DataFrame.
+                    df = pd.DataFrame.from_records(
+                        [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    )
+                    # Drop internal SQLAlchemy state if it exists.
+                    df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+
+                    # Reorder columns to match the model's order.
+                    df = df[ordered_columns]
+
+                    # Fill NaN values.
+                    if fillna_value is not None:
+                        df = df.fillna(fillna_value)
+
+                    # Convert timezone-aware columns to naive.
+                    for col in df.columns:
+                        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                            df[col] = df[col].dt.tz_localize(None)
+
+                    # Apply inferred dtypes.
+                    df = df.astype(dtypes)
+                    # Create a Dask partition.
+                    partitions.append(dd.from_pandas(df, npartitions=1))
+
+                # Concatenate all partitions.
+                if partitions:
+                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                else:
+                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                self.logger.debug(f"Loaded {len(dask_df)} rows into the Dask DataFrame.")
+
+                return dask_df
+
+            except Exception as e:
+                self.logger.error(f"Error executing query: {e}")
+                self.logger.error(self.query)
+                return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
```
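A sketch of driving `SQLAlchemyDask` directly, assuming a model already built by `SqlAlchemyModelBuilder`. Note that when no filters are supplied, `read_frame` caps the query at 100 records; the filter-spec format shown is an assumption about `FilterHandler`, and the model and URL are placeholders:

```python
# Illustrative only: `OrdersModel` and the engine URL are placeholders.
from sibi_dst.v2.df_helper.backends.sqlalchemy._io_dask import SQLAlchemyDask

loader = SQLAlchemyDask(
    model=OrdersModel,                  # a mapped ORM class
    filters={"status": "open"},         # assumed FilterHandler filter spec
    engine_url="sqlite:///example.db",  # placeholder URL
    chunk_size=500,
    debug=True,
)
ddf = loader.read_frame(fillna_value=0)  # one Dask partition per chunk
print(ddf.head())
```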
sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py

```diff
@@ -0,0 +1,142 @@
+import dask.dataframe as dd
+import pandas as pd
+
+from sibi_dst.v2.df_helper.core import ParamsConfig, QueryConfig
+from sibi_dst.v2.utils import Logger
+from ._io_dask import SQLAlchemyDask
+from ._db_connection import SqlAlchemyConnectionConfig
+
+
+class SqlAlchemyLoadFromDb:
+    """
+    The SqlAlchemyLoadFromDb class provides functionality to load data from a
+    database using SQLAlchemy into a Dask DataFrame. It is capable of handling
+    large datasets efficiently by utilizing the Dask framework for parallel
+    computations.
+
+    This class is initialized with a database connection configuration, a query
+    configuration, optional parameters, and a logger. It can execute a query
+    using the specified configurations and read the results into a Dask
+    DataFrame. This is useful for processing and analyzing large-scale data.
+
+    :ivar df: Dask DataFrame that stores the loaded data.
+    :type df: dd.DataFrame
+    :ivar db_connection: Database connection configuration object, containing details
+        such as the table, model, and engine to be used for the query.
+    :type db_connection: SqlAlchemyConnectionConfig
+    :ivar table_name: Name of the database table being queried.
+    :type table_name: str
+    :ivar model: SQLAlchemy model associated with the database connection.
+    :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
+    :ivar engine: SQLAlchemy engine used for executing queries.
+    :type engine: sqlalchemy.engine.base.Engine
+    :ivar logger: Logger instance for logging debug and error information.
+    :type logger: Logger
+    :ivar query_config: Query configuration, including query-related details such
+        as the SQL query or query settings.
+    :type query_config: QueryConfig
+    :ivar params_config: Parameters configuration, including filter parameters for
+        the query.
+    :type params_config: ParamsConfig
+    :ivar debug: Debug flag indicating whether debug mode is enabled.
+    :type debug: bool
+    :ivar chunk_size: Size of data chunks to process at a time.
+    :type chunk_size: int
+    """
+    df: dd.DataFrame = None
+
+    def __init__(
+            self,
+            plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnectionConfig
+            plugin_query: QueryConfig = None,
+            plugin_params: ParamsConfig = None,
+            debug: bool = False,
+            logger: Logger = None,
+            **kwargs,
+    ):
+        """
+        Initializes an instance of the class, setting up a database connection,
+        query configuration, parameter configuration, and other optional settings
+        such as debugging and logging. The class manages the integration and
+        interaction with SQLAlchemy-based database operations.
+
+        :param plugin_sqlalchemy:
+            The SQLAlchemy connection configuration object, which provides
+            the connection details like engine, table name, and model
+            associated with the database operations.
+        :param plugin_query:
+            The query configuration object, used to define specific query
+            options or rules. Defaults to None.
+        :param plugin_params:
+            The parameters configuration object, used for any additional
+            parameterized settings or configurations. Defaults to None.
+        :param logger:
+            Optional logger instance for logging purposes. If not provided,
+            a default logger is instantiated using the standard logging system.
+        :param kwargs:
+            Optional additional keyword arguments for customization. Can
+            include optional settings like `debug` mode or `chunk_size`
+            for batch operations.
+        """
+        self.db_connection = plugin_sqlalchemy
+        self.table_name = self.db_connection.table
+        self.model = self.db_connection.model
+        self.engine = self.db_connection.engine
+        self.debug = debug
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__, debug=self.debug)
+        self.query_config = plugin_query
+        self.params_config = plugin_params
+        self.chunk_size = kwargs.pop("chunk_size", 1000)
+
+    def build_and_load(self) -> dd.DataFrame:
+        """
+        Builds and returns the resulting dataframe after calling the internal
+        build and load function. This method triggers the `_build_and_load`
+        function to process and prepare the data before returning it as
+        a Dask dataframe.
+
+        :raises RuntimeError: If any error occurs during the build or load process.
+
+        :return: The processed data in a Dask dataframe.
+        :rtype: dd.DataFrame
+        """
+        self._build_and_load()
+        return self.df
+
+    def _build_and_load(self) -> dd.DataFrame:
+        """
+        Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
+
+        This method initializes a SQLAlchemyDask object with the provided model,
+        filters, engine URL, logger, chunk size, and debug configuration.
+        It attempts to load the data using the ``read_frame`` method of
+        SQLAlchemyDask. If the data cannot be loaded or the query returns
+        no rows, it creates and returns an empty Dask DataFrame.
+
+        :raises Exception: On failure to load data or to create a DataFrame.
+
+        :return: A Dask DataFrame object containing the queried data or an
+            empty DataFrame if the query returns no results or fails.
+        :rtype: dask.dataframe.DataFrame
+        """
+        try:
+            self.df = SQLAlchemyDask(
+                model=self.model,
+                filters=self.params_config.filters,
+                engine_url=self.engine.url,
+                logger=self.logger,
+                chunk_size=self.chunk_size,
+                debug=self.debug
+            ).read_frame()
+
+            if self.df is None or len(self.df.head().index) == 0:
+                self.logger.debug("Query returned no results.")
+                return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return self.df
+        except Exception as e:
+            self.logger.debug(f"Failed to load data into a Dask DataFrame: {e}")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
```
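End-to-end, the load path can be sketched as below. The `QueryConfig`/`ParamsConfig` constructor arguments are assumptions inferred from how `_build_and_load` reads `params_config.filters`; the URL, table, and filter values are placeholders:

```python
# Hedged end-to-end sketch; constructor signatures for the config
# objects are assumed, not confirmed by this diff.
from sibi_dst.v2.df_helper.backends.sqlalchemy import (
    SqlAlchemyConnectionConfig,
    SqlAlchemyLoadFromDb,
)
from sibi_dst.v2.df_helper.core import ParamsConfig, QueryConfig

conn = SqlAlchemyConnectionConfig(connection_url="sqlite:///example.db", table="orders")
loader = SqlAlchemyLoadFromDb(
    plugin_sqlalchemy=conn,
    plugin_query=QueryConfig(),                              # assumed default construction
    plugin_params=ParamsConfig(filters={"status": "open"}),  # assumed field name
    chunk_size=2000,
)
df = loader.build_and_load()  # Dask DataFrame; empty on failure or no rows
```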
sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py

```diff
@@ -0,0 +1,297 @@
+import re
+from collections import defaultdict
+from typing import Dict, Any, Type
+
+from sqlalchemy import MetaData, Table, and_
+from sqlalchemy import inspect
+from sqlalchemy.engine import Engine
+from sqlalchemy.exc import ArgumentError, NoForeignKeysError
+from sqlalchemy.orm import declarative_base, relationship, foreign, configure_mappers, clear_mappers
+
+from sibi_dst.v2.utils import Logger
+
+# Base class for dynamically created models.
+Base = declarative_base()
+
+# Constants.
+APPS_LABEL = "datacubes"
+RESERVED_COLUMN_NAMES = {"metadata", "class_", "table"}
+RESERVED_KEYWORDS = {"class", "def", "return", "yield", "global"}
+
+# Global registry keyed by the original table name (snake_case).
+MODEL_REGISTRY: Dict[str, Type] = {}
+
+
+class SqlAlchemyModelBuilder:
+    """
+    Dynamically builds an ORM model for a single table by reflecting its columns
+    and reverse-engineering its relationships from foreign key metadata.
+
+    In _add_relationships(), the builder groups FKs by related table, then uses the
+    remote model's mapper to retrieve its mapped columns (instead of accessing __table__ directly).
+    This ensures that the columns used in the join condition are actually present on the mapped models.
+    """
+
+    def __init__(self, engine: Engine, table_name: str, add_relationships: bool = False, debug: bool = False,
+                 logger: Logger = None) -> None:
+        self.engine = engine
+        self.table_name = table_name
+        self.add_relationships = add_relationships
+        self.debug = debug
+        self.logger = logger or Logger.default_logger(logger_name="sqlalchemy_model_builder", debug=self.debug)
+        self.metadata = MetaData()
+        # Try to reflect the specified table.
+        try:
+            self.metadata.reflect(only=[table_name], bind=self.engine)
+        except Exception as e:
+            self.logger.warning(f"Could not reflect table '{table_name}': {e}. Skipping model build.")
+            self.table = None
+        else:
+            self.table = self.metadata.tables.get(table_name)
+            if self.table is None:
+                self.logger.warning(f"Table '{table_name}' not found in the database. Skipping model build.")
+        # Generate a CamelCase model name.
+        self.model_name: str = self.normalize_class_name(table_name)
+
+    def build_model(self) -> Type:
+        try:
+            self.metadata.reflect(only=[self.table_name], bind=self.engine)
+        except Exception as e:
+            self.logger.warning(
+                f"Could not reflect table '{self.table_name}': {e}. Skipping model build."
+            )
+            return None
+
+        self.table = self.metadata.tables.get(self.table_name)
+        if self.table is None:
+            self.logger.warning(
+                f"Table '{self.table_name}' not found in the database. Skipping model build."
+            )
+            return None
+        columns = self.get_columns(self.table)
+        attrs: Dict[str, Any] = {
+            "__tablename__": self.table_name,
+            "__table__": self.table,
+            "__module__": f"{APPS_LABEL}.models",
+            "__mapper_args__": {"eager_defaults": True},
+        }
+        attrs.update(columns)
+        if self.add_relationships:
+            self._add_relationships(attrs, self.table)
+        model = type(self.model_name, (Base,), attrs)
+        MODEL_REGISTRY[self.table_name] = model
+        # Validate relationships by forcing SQLAlchemy to configure all mappers.
+        try:
+            configure_mappers()
+            self.logger.debug(f"Successfully configured mappers for model {self.model_name}.")
+        except Exception as e:
+            self.logger.error(f"Mapper configuration error for model {self.model_name}: {e}")
+            # Relationships could be removed or adjusted here before proceeding.
+            raise ValueError(f"Invalid relationship configuration in model {self.model_name}: {e}") from e
+
+        self.logger.debug(f"Created model {self.model_name} for table {self.table_name} with relationships.")
+        return model
+
+    def get_columns(self, table: Table) -> Dict[str, Any]:
+        cols: Dict[str, Any] = {}
+        for column in table.columns:
+            norm_name = self.normalize_column_name(column.name)
+            if norm_name not in RESERVED_COLUMN_NAMES:
+                cols[norm_name] = column
+        return cols
+
+    def _add_relationships(self, attrs: Dict[str, Any], table: Table) -> None:
+        """
+        Groups foreign keys by related table name and builds explicit join conditions.
+        For each group, it uses the first FK to define foreign_keys and remote_side.
+        Uses the remote model's mapper to obtain its mapped columns.
+        Temporarily adds the relationship, forces mapper configuration, and if the relationship
+        fails configuration (for example, if the FK column is not marked as a foreign key on either side),
+        the relationship is removed.
+        """
+        inspector = inspect(self.engine)
+        fk_info_list = inspector.get_foreign_keys(self.table.name)
+
+        fk_groups = defaultdict(list)
+        for fk_info in fk_info_list:
+            referred_table = fk_info.get("referred_table")
+            if referred_table:
+                fk_groups[referred_table].append(fk_info)
+
+        for related_table_name, fk_dicts in fk_groups.items():
+            # Ensure the remote model is built.
+            try:
+                if related_table_name not in MODEL_REGISTRY:
+                    self.logger.debug(f"Building missing model for related table {related_table_name}.")
+                    remote_model = SqlAlchemyModelBuilder(
+                        self.engine,
+                        related_table_name,
+                        add_relationships=False,  # Skip recursive relationship building.
+                        debug=self.debug,
+                        logger=self.logger
+                    ).build_model()
+                    if related_table_name not in MODEL_REGISTRY or remote_model is None:
+                        raise ValueError(f"Failed to build model for table {related_table_name}.")
+                else:
+                    remote_model = MODEL_REGISTRY[related_table_name]
+            except Exception as e:
+                self.logger.warning(f"Could not build model for table {related_table_name}: {e}")
+                continue
+
+            # Get the mapper directly.
+            remote_mapper = remote_model.__mapper__
+            join_conditions = []
+            local_foreign_keys = []
+            remote_side_keys = []
+
+            # Build join conditions from the FK dictionaries.
+            for fk_info in fk_dicts:
+                local_cols = fk_info.get("constrained_columns", [])
+                remote_cols = fk_info.get("referred_columns", [])
+                if not local_cols or not remote_cols:
+                    self.logger.warning(
+                        f"Incomplete foreign key definition for table {related_table_name} in table {self.table_name}."
+                    )
+                    continue
+
+                local_col_name = local_cols[0]
+                remote_col_name = remote_cols[0]
+
+                try:
+                    local_col = self.table.c[local_col_name]
+                except KeyError:
+                    self.logger.warning(
+                        f"Local column {local_col_name} not found in table {self.table_name}. Skipping FK."
+                    )
+                    continue
+
+                try:
+                    remote_col = remote_mapper.columns[remote_col_name]
+                except KeyError:
+                    self.logger.warning(
+                        f"Remote column {remote_col_name} not found in model {remote_model.__name__}. Skipping FK."
+                    )
+                    continue
+
+                # --- Extra validation step ---
+                # Ensure the local column is actually defined as a foreign key.
+                if not local_col.foreign_keys:
+                    self.logger.warning(
+                        f"Local column {local_col_name} in table {self.table_name} is not defined as a foreign key. Skipping relationship."
+                    )
+                    continue
+                # Optionally, check that the remote column is part of the remote table.
+                if remote_col.name not in remote_model.__table__.columns.keys():
+                    self.logger.warning(
+                        f"Remote column {remote_col_name} is not present in the table for model {remote_model.__name__}. Skipping relationship."
+                    )
+                    continue
+
+                # Annotate the local column as foreign.
+                join_conditions.append(foreign(local_col) == remote_col)
+                local_foreign_keys.append(local_col)
+                remote_side_keys.append(remote_col)
+
+            if not join_conditions:
+                self.logger.warning(
+                    f"No valid join conditions for relationship from {self.table_name} to {related_table_name}."
+                )
+                continue
+
+            primaryjoin_expr = join_conditions[0] if len(join_conditions) == 1 else and_(*join_conditions)
+            relationship_name = self.normalize_column_name(related_table_name)
+            if relationship_name in attrs:
+                continue
+
+            # --- Temporarily add the relationship; the default argument binds the registry lookup ---
+            try:
+                attrs[relationship_name] = relationship(
+                    lambda rt=related_table_name: MODEL_REGISTRY[rt],
+                    primaryjoin=primaryjoin_expr,
+                    foreign_keys=local_foreign_keys,
+                    remote_side=remote_side_keys,
+                    lazy="joined",
+                    viewonly=True  # Use viewonly=True if persistence is not needed.
+                )
+                self.logger.debug(
+                    f"Temporarily added relationship {relationship_name} on model {self.model_name} for testing."
+                )
+            except (ArgumentError, NoForeignKeysError) as e:
+                self.logger.error(
+                    f"Error creating relationship '{relationship_name}' on model {self.model_name} referencing {related_table_name}: {e}"
+                )
+                continue
+
+            # --- Validate the relationship by forcing mapper configuration ---
+            try:
+                configure_mappers()
+                self.logger.debug(
+                    f"Relationship {relationship_name} on model {self.model_name} validated successfully."
+                )
+            except Exception as e:
+                self.logger.error(
+                    f"Relationship '{relationship_name}' on model {self.model_name} failed configuration: {e}"
+                )
+                del attrs[relationship_name]
+                self.logger.debug(
+                    f"Removed relationship '{relationship_name}' from model {self.model_name} due to a configuration error."
+                )
+                clear_mappers()
+                continue
+
+    @staticmethod
+    def normalize_class_name(table_name: str) -> str:
+        table_name = str(table_name)
+        return "".join(word.capitalize() for word in table_name.split("_"))
+
+    def normalize_column_name(self, column_name: Any) -> str:
+        try:
+            # Force the column name into a string.
+            s = str(column_name)
+        except Exception as e:
+            self.logger.debug(f"Failed to convert column name {column_name} to string: {e}")
+            s = ""
+        norm_name = re.sub(r"\W|^(?=\d)", "_", s)
+        if norm_name in RESERVED_KEYWORDS:
+            norm_name += "_field"
+        return norm_name
+
+    @staticmethod
+    def export_models_to_file(filename: str) -> None:
+        """
+        Export dynamically built models (from MODEL_REGISTRY) to a Python file.
+        This function writes out a simplified version of each model definition.
+        """
+        with open(filename, "w") as f:
+            # Write header imports.
+            f.write("from sqlalchemy import Column, Integer, String, Float, DateTime, Boolean, ForeignKey\n")
+            f.write("from sqlalchemy.orm import relationship\n")
+            f.write("from sqlalchemy.ext.declarative import declarative_base\n\n")
+            f.write("Base = declarative_base()\n\n\n")
+
+            for table_name, model in MODEL_REGISTRY.items():
+                print(f"Exporting model for table {table_name} as {model.__name__}...")
+                # Write the class header.
+                f.write(f"class {model.__name__}(Base):\n")
+                f.write(f"    __tablename__ = '{table_name}'\n")
+
+                # Write column definitions.
+                for column in model.__table__.columns:
+                    # Get the column type name (a simple conversion).
+                    col_type = column.type.__class__.__name__
+                    col_def = f"    {column.name} = Column({col_type}"
+                    if column.primary_key:
+                        col_def += ", primary_key=True"
+                    # More column attributes can be added here if needed.
+                    col_def += ")\n"
+                    f.write(col_def)
+
+                # Write relationship definitions.
+                # This simple version writes relationships with just the target class name.
+                for rel in model.__mapper__.relationships:
+                    f.write(f"    {rel.key} = relationship('{rel.mapper.class_.__name__}')\n")
+
+                f.write("\n\n")
+
+        print(f"Models exported to {filename}")
```
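Finally, a minimal sketch of the model builder on its own; the engine URL and table name are placeholders:

```python
# Reflect one table into an ORM model and export the registry to a file.
from sqlalchemy import create_engine
from sibi_dst.v2.df_helper.backends.sqlalchemy import SqlAlchemyModelBuilder

engine = create_engine("sqlite:///example.db")  # placeholder URL
builder = SqlAlchemyModelBuilder(engine, "orders", add_relationships=True, debug=True)
OrdersModel = builder.build_model()             # returns None if reflection fails
if OrdersModel is not None:
    SqlAlchemyModelBuilder.export_models_to_file("models.py")
```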