sibi-dst 0.3.45-py3-none-any.whl → 0.3.47-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- sibi_dst/__init__.py +38 -0
- sibi_dst/{df_helper → v1/df_helper}/_artifact_updater_multi_wrapper.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/_df_helper.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/_parquet_artifact.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/_parquet_reader.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/django/_load_from_db.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/backends/http/_http_config.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_filter_handler.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_parquet_options.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_load_from_db.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_sql_model_builder.py +2 -1
- sibi_dst/{df_helper → v1/df_helper}/core/_filter_handler.py +1 -1
- sibi_dst/v1/osmnx_helper/__init__.py +6 -0
- sibi_dst/{tests → v1/tests}/test_data_wrapper_class.py +11 -10
- sibi_dst/{utils → v1/utils}/__init__.py +4 -0
- sibi_dst/{utils → v1/utils}/clickhouse_writer.py +1 -1
- sibi_dst/v1/utils/data_from_http_source.py +49 -0
- sibi_dst/{utils → v1/utils}/data_utils.py +5 -3
- sibi_dst/{utils → v1/utils}/data_wrapper.py +3 -1
- sibi_dst/{utils → v1/utils}/date_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/file_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/filepath_generator.py +1 -1
- sibi_dst/{utils → v1/utils}/parquet_saver.py +1 -1
- sibi_dst/v1/utils/storage_config.py +28 -0
- sibi_dst/v2/df_helper/__init__.py +7 -0
- sibi_dst/v2/df_helper/_df_helper.py +214 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +10 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +82 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +135 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +297 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +9 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +78 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +122 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +283 -0
- sibi_dst/v2/df_helper/core/__init__.py +9 -0
- sibi_dst/v2/df_helper/core/_filter_handler.py +236 -0
- sibi_dst/v2/df_helper/core/_params_config.py +139 -0
- sibi_dst/v2/df_helper/core/_query_config.py +17 -0
- sibi_dst/v2/utils/__init__.py +5 -0
- sibi_dst/v2/utils/log_utils.py +120 -0
- {sibi_dst-0.3.45.dist-info → sibi_dst-0.3.47.dist-info}/METADATA +3 -2
- sibi_dst-0.3.47.dist-info/RECORD +80 -0
- sibi_dst/osmnx_helper/__init__.py +0 -9
- sibi_dst/osmnx_helper/v2/base_osm_map.py +0 -153
- sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
- sibi_dst-0.3.45.dist-info/RECORD +0 -62
- /sibi_dst/{df_helper/backends → v1}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/df_helper/backends}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_io_dask.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_sql_model_builder.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/http/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/parquet/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_filter_handler.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_defaults.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_params_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_query_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/data_cleaner.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/__init__.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/geo_location_service.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/base_osm_map.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/calendar_html.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/router_plotter.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v2 → v1/tests}/__init__.py +0 -0
- /sibi_dst/{utils → v1/utils}/airflow_manager.py +0 -0
- /sibi_dst/{utils → v1/utils}/credentials.py +0 -0
- /sibi_dst/{utils → v1/utils}/df_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/log_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/phone_formatter.py +0 -0
- /sibi_dst/{utils → v1/utils}/storage_manager.py +0 -0
- /sibi_dst/{osmnx_helper/v2/basemaps → v2}/__init__.py +0 -0
- /sibi_dst/{tests → v2/df_helper/backends}/__init__.py +0 -0
- {sibi_dst-0.3.45.dist-info → sibi_dst-0.3.47.dist-info}/WHEEL +0 -0
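The headline change in this release is a namespace reorganization: everything that previously lived at the top level of `sibi_dst` (`df_helper`, `utils`, `geopy_helper`, `osmnx_helper`, `tests`) moves under `sibi_dst.v1`, and a new `sibi_dst.v2` tree introduces a SQLModel-based backend alongside the existing SQLAlchemy one. Downstream imports would shift accordingly; a minimal sketch inferred from the renames above (whether the expanded `sibi_dst/__init__.py` adds backward-compatible aliases is not visible in this listing):

```python
# 0.3.45 layout (before):
#   from sibi_dst import df_helper, utils

# 0.3.47 layout (after): the same modules relocated under the v1 namespace...
from sibi_dst.v1 import df_helper, utils

# ...plus the new SQLModel backend under v2.
from sibi_dst.v2.df_helper.backends import sqlmodel
```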
--- /dev/null
+++ b/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py
@@ -0,0 +1,78 @@
+from typing import Any, Optional
+
+from pydantic import BaseModel, model_validator, ConfigDict
+from sqlmodel import create_engine
+from sqlalchemy import text
+from sqlalchemy.exc import OperationalError
+
+from sibi_dst.v2.utils import Logger
+from ._model_builder import SQLModelModelBuilder  # Refactored builder for SQLModel
+
+
+class SQLModelConnectionConfig(BaseModel):
+    """
+    Configuration for establishing an SQLModel database connection and dynamically building
+    an ORM model for a specific table.
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    connection_url: str
+    table: Optional[str] = None
+    model: Any = None
+    engine: Optional[Any] = None
+    debug: bool = False
+    logger: Optional[Logger] = None
+    add_relationships: bool = False
+    export_models: bool = False
+    export_file_name: str = 'models.py'
+
+    @model_validator(mode="after")
+    def validate_and_initialize(self) -> "SQLModelConnectionConfig":
+        """
+        Validates the configuration, initializes the engine, tests the connection,
+        and builds the ORM model for the specified table.
+        """
+        self.logger = self.logger or Logger.default_logger(logger_name="sqlmodel_connection", debug=self.debug)
+        self.logger.debug("Validating and initializing SQLModel connection configuration.")
+
+        if not self.connection_url:
+            raise ValueError("`connection_url` must be provided.")
+
+        # Initialize the engine using SQLModel's create_engine.
+        self.engine = create_engine(self.connection_url)
+        self.logger.debug("Engine created for connection URL.")
+
+        # Validate the connection.
+        self.validate_connection()
+
+        if not self.table:
+            raise ValueError("`table` must be provided to build the model.")
+
+        try:
+            builder = SQLModelModelBuilder(
+                self.engine,
+                self.table,
+                self.add_relationships,
+                self.debug,
+                self.logger
+            )
+            self.model = builder.build_model()
+            if self.export_models:
+                builder.export_models_to_file(self.export_file_name)
+            self.logger.debug(f"Successfully built model for table: {self.table}")
+        except Exception as e:
+            raise ValueError(f"Failed to build model for table {self.table}: {e}")
+
+        return self
+
+    def validate_connection(self) -> None:
+        """
+        Tests the database connection by executing a simple query.
+        Raises:
+            ValueError: If the connection cannot be established.
+        """
+        try:
+            with self.engine.connect() as connection:
+                connection.execute(text("SELECT 1"))
+            self.logger.debug("Database connection validated.")
+        except OperationalError as e:
+            raise ValueError(f"Failed to connect to the database: {e}")
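For orientation, a minimal usage sketch of the class above. The connection URL and table name are hypothetical, and the import uses the private module path from the file listing (the backend's new `__init__.py` presumably re-exports the class, but its contents are not shown here):

```python
from sibi_dst.v2.df_helper.backends.sqlmodel._db_connection import SQLModelConnectionConfig

# Constructing the config runs the model_validator above: it creates the
# engine, probes the connection with SELECT 1, and builds the ORM model.
config = SQLModelConnectionConfig(
    connection_url="sqlite:///example.db",  # hypothetical database
    table="customers",                      # hypothetical table name
    debug=True,
)
Customer = config.model  # dynamically built SQLModel class for `customers`
```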
--- /dev/null
+++ b/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py
@@ -0,0 +1,122 @@
+import itertools
+import dask.dataframe as dd
+import pandas as pd
+from sqlmodel import create_engine, Session, select
+from sibi_dst.v2.df_helper.core import FilterHandler
+from sibi_dst.v2.utils import Logger
+
+
+class SQLModelDask:
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
+        """
+        Initialize with a SQLModel query and a database connection URL.
+
+        :param model: SQLModel ORM model.
+        :param filters: Filters to apply on the query.
+        :param engine_url: Database connection string.
+        :param chunk_size: Number of records per chunk for Dask partitions.
+        :param logger: Logger instance for logging.
+        :param debug: Whether to enable detailed logging.
+        """
+        self.query = None
+        self.model = model
+        self.filters = filters
+        self.chunk_size = chunk_size
+        self.debug = debug
+        # Create the engine using SQLModel's create_engine
+        self.engine = create_engine(engine_url)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__, debug=debug)
+        self.logger.set_level(self.logger.DEBUG if debug else self.logger.INFO)
+
+    @staticmethod
+    def infer_dtypes_from_model(model):
+        """
+        Infer Dask DataFrame dtypes based on the SQLModel columns.
+        """
+        # Mapping SQLAlchemy type names to Dask/Pandas dtypes.
+        sqlalchemy_to_dask_dtype = {
+            'INTEGER': 'Int64',
+            'SMALLINT': 'Int64',
+            'BIGINT': 'Int64',
+            'FLOAT': 'float64',
+            'NUMERIC': 'float64',
+            'BOOLEAN': 'bool',
+            'VARCHAR': 'object',
+            'TEXT': 'object',
+            'DATE': 'datetime64[ns]',
+            'DATETIME': 'datetime64[ns]',
+            'TIME': 'object',
+            'UUID': 'object',
+        }
+        dtypes = {}
+        for column in model.__table__.columns:
+            # Get the column type name in uppercase.
+            type_str = str(column.type).upper()
+            dtype = sqlalchemy_to_dask_dtype.get(type_str, 'object')
+            dtypes[column.name] = dtype
+        return dtypes
+
+    def read_frame(self, fillna_value=None):
+        """
+        Load data from a SQLModel query into a Dask DataFrame.
+
+        :param fillna_value: Value to replace NaN/NULL values with, if any.
+        :return: A Dask DataFrame containing the query results.
+        """
+        try:
+            with Session(self.engine) as session:
+                # Build the base query.
+                self.query = select(self.model.__table__)
+                if self.filters:
+                    # Apply filters using FilterHandler (assumed to work for SQLModel as well)
+                    self.query = FilterHandler(backend="sqlmodel", logger=self.logger, debug=self.debug).apply_filters(
+                        self.query, model=self.model, filters=self.filters
+                    )
+                else:
+                    # If no filters provided, limit to a small number of records for safety.
+                    n_records = 100
+                    self.query = self.query.limit(n_records)
+                self.logger.debug(f"query: {self.query}")
+
+                # Infer dtypes from the model.
+                dtypes = self.infer_dtypes_from_model(self.model)
+                # Get the column order from the model's table.
+                ordered_columns = [column.name for column in self.model.__table__.columns]
+
+                # Execute the query and fetch all results.
+                results = session.exec(self.query).all()
+                iterator = iter(results)
+                partitions = []
+
+                while True:
+                    chunk = list(itertools.islice(iterator, self.chunk_size))
+                    if not chunk:
+                        break
+                    # Convert each SQLModel instance to a dictionary using the built-in .dict() method.
+                    df = pd.DataFrame([row.dict() for row in chunk])
+                    # Drop SQLModel/SQLAlchemy internal state if present.
+                    df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+                    # Reorder columns to match the model's column order.
+                    df = df[ordered_columns]
+                    if fillna_value is not None:
+                        df = df.fillna(fillna_value)
+                    # Remove timezone information from datetime columns.
+                    for col in df.columns:
+                        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                            df[col] = df[col].dt.tz_localize(None)
+                    df = df.astype(dtypes)
+                    partitions.append(dd.from_pandas(df, npartitions=1))
+
+                if partitions:
+                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                else:
+                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+                return dask_df
+
+        except Exception as e:
+            self.logger.error(f"Error executing query: {str(e)}")
+            self.logger.error(self.query)
+            # In case of error, return an empty Dask DataFrame with the expected columns.
+            return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
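Continuing that sketch, `SQLModelDask` pairs the dynamically built model with a filter spec and turns the result set into one Dask partition per `chunk_size` rows; note that `read_frame` fetches every row eagerly before partitioning and caps unfiltered queries at 100 rows. A hypothetical call (the filter-key syntax is whatever the new `FilterHandler` accepts, which this diff does not show):

```python
from sibi_dst.v2.df_helper.backends.sqlmodel._io_dask import SQLModelDask

loader = SQLModelDask(
    model=Customer,                     # model built in the previous sketch
    filters={"id__gte": 100},           # hypothetical filter spec
    engine_url="sqlite:///example.db",  # hypothetical database
    chunk_size=500,
)
ddf = loader.read_frame()               # Dask DataFrame, one partition per 500-row chunk
print(ddf.npartitions)
```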
--- /dev/null
+++ b/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py
@@ -0,0 +1,142 @@
+import dask.dataframe as dd
+import pandas as pd
+
+from sibi_dst.v2.df_helper.core import ParamsConfig, QueryConfig
+from sibi_dst.v2.utils import Logger
+from ._io_dask import SQLModelDask
+from ._db_connection import SQLModelConnectionConfig
+
+
+class SQLModelLoadFromDb:
+    """
+    The SqlAlchemyLoadFromDb class provides functionality to load data from a
+    database using SQLAlchemy into a Dask DataFrame. It is capable of handling
+    large datasets efficiently by utilizing the Dask framework for parallel
+    computations.
+
+    This class is initialized with a database connection configuration, query
+    configuration, optional parameters, and a logger. It can execute a query
+    using the specified configurations and read the results into a Dask
+    DataFrame. This is useful for processing and analyzing large-scale data.
+
+    :ivar df: Dask DataFrame to store the loaded data.
+    :type df: dd.DataFrame
+    :ivar db_connection: Database connection configuration object, containing details
+        such as the table, model, and engine to be used for the query.
+    :type db_connection: SqlAlchemyConnectionConfig
+    :ivar table_name: Name of the database table being queried.
+    :type table_name: str
+    :ivar model: SQLAlchemy model associated with the database connection.
+    :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
+    :ivar engine: SQLAlchemy engine used for executing queries.
+    :type engine: sqlalchemy.engine.base.Engine
+    :ivar logger: Logger instance for logging debug and error information.
+    :type logger: Logger
+    :ivar query_config: Query configuration, including query-related details such
+        as the SQL query or query settings.
+    :type query_config: QueryConfig
+    :ivar params_config: Parameters configuration, including filter parameters for
+        the query.
+    :type params_config: ParamsConfig
+    :ivar debug: Debug flag indicating whether debug mode is enabled.
+    :type debug: bool
+    :ivar chunk_size: Size of data chunks to process at a time.
+    :type chunk_size: int
+    """
+    df: dd.DataFrame = None
+
+    def __init__(
+        self,
+        plugin_sqlalchemy: SQLModelConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+        plugin_query: QueryConfig = None,
+        plugin_params: ParamsConfig = None,
+        debug: bool = False,
+        logger: Logger = None,
+        **kwargs,
+    ):
+        """
+        Initializes an instance of the class, setting up a database connection,
+        query configuration, parameter configuration, and other optional settings
+        like debugging and logging. The class aims to manage the integration and
+        interaction with SQLAlchemy-based database operations.
+
+        :param plugin_sqlalchemy:
+            The SQLAlchemy connection configuration object, which provides
+            the connection details like engine, table name, and model
+            associated with the database operations.
+        :param plugin_query:
+            The query configuration object, used to define specific query
+            options or rules. Defaults to None.
+        :param plugin_params:
+            The parameters configuration object, used for any additional
+            parameterized settings or configurations. Defaults to None.
+        :param logger:
+            Optional logger instance for logging purposes. If not provided,
+            a default logger is instantiated using the standard logging system.
+        :param kwargs:
+            Optional additional keyword arguments for customization. Can
+            include optional settings like `debug` mode or `chunk_size`
+            for batch operations.
+        """
+        self.db_connection = plugin_sqlalchemy
+        self.table_name = self.db_connection.table
+        self.model = self.db_connection.model
+        self.engine = self.db_connection.engine
+        self.debug = debug
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__, debug=self.debug)
+        self.query_config = plugin_query
+        self.params_config = plugin_params
+        self.chunk_size = kwargs.pop("chunk_size", 1000)
+
+    def build_and_load(self) -> dd.DataFrame:
+        """
+        Builds and returns the resulting dataframe after calling the internal
+        build and load function. This method triggers the `_build_and_load`
+        function to process and prepare the data before returning it as
+        a dask dataframe.
+
+        :raises RuntimeError: If any error occurs during the build or load process.
+
+        :return: The processed data in a dask dataframe.
+        :rtype: dd.DataFrame
+        """
+        self._build_and_load()
+        return self.df
+
+    def _build_and_load(self) -> dd.DataFrame:
+        """
+        Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
+
+        This method initializes a SQLAlchemyDask object with the provided model,
+        filters, engine URL, logger, chunk size, and debug configuration.
+        It attempts to load the data using the ``read_frame`` method of
+        SQLAlchemyDask. If the data cannot be loaded or the query returns
+        no rows, it creates and returns an empty Dask DataFrame.
+
+        :raises Exception: On failure to load data or to create a DataFrame.
+
+        :return: A Dask DataFrame object containing the queried data or an
+            empty DataFrame if the query returns no results or fails.
+        :rtype: dask.dataframe.DataFrame
+        """
+        try:
+            self.df = SQLModelDask(
+                model=self.model,
+                filters=self.params_config.filters,
+                engine_url=self.engine.url,
+                logger=self.logger,
+                chunk_size=self.chunk_size,
+                debug=self.debug
+            ).read_frame()
+
+            if self.df is None or len(self.df.head().index) == 0:
+                self.logger.debug("Query returned no results.")
+                dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+                return dask_df
+            return self.df
+        except Exception as e:
+            self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+            return dask_df
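`SQLModelLoadFromDb` is the orchestration layer that wires the pieces above together: it pulls the table, model, and engine off the connection config and delegates the actual read to `SQLModelDask`. A hedged sketch; `ParamsConfig` and `QueryConfig` live in `sibi_dst/v2/df_helper/core` (added in this release but not shown), so their constructor signatures here are assumptions:

```python
from sibi_dst.v2.df_helper.backends.sqlmodel._load_from_db import SQLModelLoadFromDb
from sibi_dst.v2.df_helper.core import ParamsConfig, QueryConfig

loader = SQLModelLoadFromDb(
    plugin_sqlalchemy=config,                              # SQLModelConnectionConfig from the first sketch
    plugin_query=QueryConfig(),                            # assumed default-constructible
    plugin_params=ParamsConfig(filters={"id__gte": 100}),  # assumed field; _build_and_load reads .filters
    chunk_size=500,                                        # forwarded via **kwargs
)
ddf = loader.build_and_load()
```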
--- /dev/null
+++ b/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py
@@ -0,0 +1,283 @@
+import re
+from collections import defaultdict
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, Type, get_args, get_origin
+
+from sqlalchemy import and_, inspect, cast, func
+from sqlalchemy.exc import ArgumentError, NoForeignKeysError
+from sqlalchemy.orm import relationship, foreign, configure_mappers, clear_mappers
+from sqlalchemy.sql.sqltypes import Integer, String, Float, DateTime, Boolean, Numeric, Text
+
+from sqlmodel import SQLModel, create_engine
+from sibi_dst.v2.utils import Logger
+
+APPS_LABEL = "datacubes"
+RESERVED_COLUMN_NAMES = {"metadata", "class_", "table"}
+RESERVED_KEYWORDS = {"class", "def", "return", "yield", "global"}
+
+MODEL_REGISTRY: Dict[str, Type] = {}
+
+
+class SQLModelModelBuilder:
+    """
+    Dynamically builds an ORM model for a single table by reflecting its columns
+    and reverse-engineering its relationships from foreign key metadata using SQLModel.
+    The generated model is mapped solely via its reflected __table__ attribute.
+    """
+
+    def __init__(
+        self,
+        engine,
+        table_name: str,
+        add_relationships: bool = False,
+        debug: bool = False,
+        logger: Optional[Logger] = None,
+    ) -> None:
+        self.engine = engine
+        self.table_name = table_name
+        self.add_relationships = add_relationships
+        self.debug = debug
+        self.logger = logger or Logger.default_logger(logger_name="sqlmodel_model_builder", debug=self.debug)
+        # Use SQLModel's shared metadata.
+        self.metadata = SQLModel.metadata
+        self.metadata.bind = self.engine
+
+        try:
+            self.metadata.reflect(only=[table_name], bind=self.engine)
+        except Exception as e:
+            self.logger.warning(f"Could not reflect table '{table_name}': {e}. Skipping model build.")
+            self.table = None
+        else:
+            self.table = self.metadata.tables.get(table_name)
+            if self.table is None:
+                self.logger.warning(f"Table '{table_name}' not found in the database. Skipping model build.")
+        self.model_name: str = self.normalize_class_name(table_name)
+        if self.debug:
+            self.logger.debug(f"Reflected table for '{table_name}': {self.table}")
+
+    def build_model(self) -> Optional[Type]:
+        try:
+            self.metadata.reflect(only=[self.table_name], bind=self.engine)
+        except Exception as e:
+            self.logger.warning(f"Could not reflect table '{self.table_name}': {e}. Skipping model build.")
+            return None
+
+        self.table = self.metadata.tables.get(self.table_name)
+        if self.table is None:
+            self.logger.warning(f"Table '{self.table_name}' not found in the database. Skipping model build.")
+            return None
+
+        # Force registration of the reflected table in the metadata.
+        try:
+            self.metadata._add_table(self.table_name, None, self.table)
+        except Exception as e:
+            self.logger.debug(f"Error forcing table registration: {e}")
+
+        columns, annotations = self.get_columns(self.table)
+        # Build the mapping dictionary using only __table__.
+        attrs: Dict[str, Any] = {
+            "__table__": self.table,
+            "__module__": f"{APPS_LABEL}.models",
+            "__mapper_args__": {"eager_defaults": True},
+            "__annotations__": annotations,
+        }
+        attrs.update(columns)
+        if self.add_relationships:
+            self._add_relationships(attrs, self.table)
+        model = type(self.model_name, (SQLModel,), attrs)
+        MODEL_REGISTRY[self.table_name] = model
+
+        try:
+            configure_mappers()
+            self.logger.debug(f"Configured mappers for model {self.model_name}.")
+        except Exception as e:
+            self.logger.error(f"Mapper configuration error for model {self.model_name}: {e}")
+            raise ValueError(f"Invalid mapping in model {self.model_name}: {e}") from e
+
+        # Register the mapping.
+        SQLModel.metadata.create_all(self.engine)
+        self.logger.debug(f"Created model {self.model_name} for table {self.table_name}.")
+        return model
+
+    def get_columns(self, table: Any) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        cols: Dict[str, Any] = {}
+        annotations: Dict[str, Any] = {}
+        for column in table.columns:
+            norm_name = self.normalize_column_name(column.name)
+            if norm_name in RESERVED_COLUMN_NAMES:
+                continue
+            if norm_name in cols:
+                self.logger.warning(f"Duplicate normalized column name '{norm_name}'; skipping duplicate for column '{column.name}'.")
+                continue
+            cols[norm_name] = column
+            annotations[norm_name] = self._python_type_for_column(column)
+        return cols, annotations
+
+    def _python_type_for_column(self, column: Any) -> Any:
+        col_type = type(column.type)
+        if issubclass(col_type, Integer):
+            return int
+        elif issubclass(col_type, (String, Text)):
+            return str
+        elif issubclass(col_type, (Float, Numeric)):
+            return float
+        elif issubclass(col_type, DateTime):
+            return datetime
+        elif issubclass(col_type, Boolean):
+            return bool
+        else:
+            return Any
+
+    def _add_relationships(self, attrs: Dict[str, Any], table: Any) -> None:
+        inspector = inspect(self.engine)
+        fk_info_list = inspector.get_foreign_keys(self.table.name)
+        fk_groups = defaultdict(list)
+        for fk_info in fk_info_list:
+            referred_table = fk_info.get("referred_table")
+            if referred_table:
+                fk_groups[referred_table].append(fk_info)
+
+        for related_table_name, fk_dicts in fk_groups.items():
+            try:
+                if related_table_name not in MODEL_REGISTRY:
+                    self.logger.debug(f"Building missing model for related table {related_table_name}.")
+                    remote_model = SQLModelModelBuilder(
+                        self.engine,
+                        related_table_name,
+                        add_relationships=False,
+                        debug=self.debug,
+                        logger=self.logger,
+                    ).build_model()
+                    if related_table_name not in MODEL_REGISTRY or remote_model is None:
+                        raise ValueError(f"Failed to build model for table {related_table_name}.")
+                else:
+                    remote_model = MODEL_REGISTRY[related_table_name]
+            except Exception as e:
+                self.logger.warning(f"Could not build model for table {related_table_name}: {e}")
+                continue
+
+            remote_table = remote_model.__table__
+            join_conditions = []
+            local_foreign_keys = []
+            remote_side_keys = []
+            for fk_info in fk_dicts:
+                local_cols = fk_info.get("constrained_columns", [])
+                remote_cols = fk_info.get("referred_columns", [])
+                if not local_cols or not remote_cols:
+                    self.logger.warning(f"Incomplete FK definition for {related_table_name} in {self.table_name}.")
+                    continue
+                local_col_name = local_cols[0]
+                remote_col_name = remote_cols[0]
+                try:
+                    local_col = self.table.c[local_col_name]
+                except KeyError:
+                    self.logger.warning(f"Local column {local_col_name} not found in {self.table_name}.")
+                    continue
+                try:
+                    remote_col = remote_table.columns[remote_col_name]
+                except KeyError:
+                    self.logger.warning(f"Remote column {remote_col_name} not found in model {remote_model.__name__}.")
+                    continue
+                if not local_col.foreign_keys:
+                    self.logger.warning(f"Column {local_col_name} in {self.table_name} is not defined as a foreign key.")
+                    continue
+                if remote_col.name not in remote_model.__table__.columns.keys():
+                    self.logger.warning(f"Remote column {remote_col.name} not in table for model {remote_model.__name__}.")
+                    continue
+                join_conditions.append(foreign(local_col) == remote_col)
+                local_foreign_keys.append(local_col)
+                remote_side_keys.append(remote_col)
+            if not join_conditions:
+                self.logger.warning(f"No valid join conditions for relationship from {self.table_name} to {related_table_name}.")
+                continue
+            primaryjoin_expr = join_conditions[0] if len(join_conditions) == 1 else and_(*join_conditions)
+            relationship_name = self.normalize_column_name(related_table_name)
+            if relationship_name in attrs:
+                continue
+            try:
+                rel = relationship(
+                    lambda rt=related_table_name: MODEL_REGISTRY[rt],
+                    primaryjoin=primaryjoin_expr,
+                    foreign_keys=local_foreign_keys,
+                    remote_side=remote_side_keys,
+                    lazy="joined",
+                    viewonly=True,
+                )
+                attrs[relationship_name] = rel
+                attrs.setdefault("__annotations__", {})[relationship_name] = List[remote_model]
+                self.logger.debug(f"Added relationship '{relationship_name}' referencing {related_table_name}.")
+            except (ArgumentError, NoForeignKeysError) as e:
+                self.logger.error(f"Error creating relationship '{relationship_name}' on model {self.model_name}: {e}")
+                continue
+            try:
+                configure_mappers()
+                self.logger.debug(f"Validated relationship '{relationship_name}' on model {self.model_name}.")
+            except Exception as e:
+                self.logger.error(f"Relationship '{relationship_name}' on model {self.model_name} failed configuration: {e}")
+                del attrs[relationship_name]
+                self.logger.debug(f"Removed relationship '{relationship_name}' from model {self.model_name}.")
+                clear_mappers()
+                continue
+
+    @staticmethod
+    def normalize_class_name(table_name: str) -> str:
+        return "".join(word.capitalize() for word in table_name.split("_"))
+
+    def normalize_column_name(self, column_name: Any) -> str:
+        try:
+            s = str(column_name)
+        except Exception as e:
+            self.logger.debug(f"Failed to convert column name {column_name} to string: {e}")
+            s = ""
+        norm_name = re.sub(r"\W|^(?=\d)", "_", s)
+        if norm_name in RESERVED_KEYWORDS:
+            norm_name += "_field"
+        return norm_name
+
+    @staticmethod
+    def export_models_to_file(filename: str) -> None:
+        reserved_attrs = {"metadata", "__tablename__", "__sqlmodel_relationships__", "__name__"}
+        import re
+        import typing
+
+        with open(filename, "w") as f:
+            f.write("from sqlmodel import SQLModel, Field, Relationship, Column\n")
+            f.write("from sqlalchemy import ForeignKey\n")
+            f.write("from sqlalchemy.sql.elements import DefaultClause\n")
+            f.write("from sqlalchemy.sql.sqltypes import INTEGER, DATE, VARCHAR, SMALLINT, FLOAT, CHAR, TEXT, DATETIME\n")
+            f.write("from sqlalchemy.dialects.mysql import TINYINT\n")
+            f.write("from typing import Any, List, Optional, Union\n")
+            f.write("import typing\n")
+            f.write("import sqlalchemy\n\n\n")
+
+            f.write("class Base(SQLModel):\n")
+            f.write("    class Config:\n")
+            f.write("        arbitrary_types_allowed = True\n\n\n")
+
+            for table_name, model in MODEL_REGISTRY.items():
+                f.write(f"class {model.__name__}(SQLModel, table=True):\n")
+                f.write(f"    __tablename__ = '{table_name}'\n")
+                for column in model.__table__.columns:
+                    col_repr = repr(column)
+                    col_repr = re.sub(r", table=<[^>]+>", "", col_repr)
+                    col_repr = re.sub(r",\s*server_default=DefaultClause\([^)]*\)", "", col_repr)
+                    col_repr = re.sub(r",\s*display_width=\d+", "", col_repr)
+                    f.write(f"    {column.name}: Any = Field(sa_column={col_repr})\n")
+                annotations = typing.get_type_hints(model)
+                col_names = {col.name for col in model.__table__.columns}
+                for key, type_hint in annotations.items():
+                    if key in col_names or key in reserved_attrs or key.startswith("__"):
+                        continue
+                    origin = get_origin(type_hint)
+                    if origin in (list, List):
+                        remote_model = get_args(type_hint)[0]
+                        remote_model_name = remote_model.__name__
+                    elif origin is Optional:
+                        args = get_args(type_hint)
+                        non_none = [arg for arg in args if arg is not type(None)]
+                        remote_model_name = non_none[0].__name__ if non_none else "Any"
+                    else:
+                        remote_model_name = type_hint.__name__ if hasattr(type_hint, '__name__') else str(type_hint)
+                    f.write(f"    {key}: {type_hint} = Relationship(\"{remote_model_name}\")\n")
+                f.write("\n\n")
+        print(f"Models exported to {filename}")
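The builder can also be driven standalone: it reflects a single table into `SQLModel.metadata`, caches the generated class in the module-level `MODEL_REGISTRY`, and can dump every registered model back out as importable source. A sketch with placeholder names:

```python
from sqlmodel import create_engine
from sibi_dst.v2.df_helper.backends.sqlmodel._model_builder import SQLModelModelBuilder

engine = create_engine("sqlite:///example.db")  # hypothetical database
builder = SQLModelModelBuilder(engine, "orders", add_relationships=True, debug=True)  # hypothetical table
Orders = builder.build_model()                  # returns None if reflection fails
if Orders is not None:
    SQLModelModelBuilder.export_models_to_file("generated_models.py")  # dumps MODEL_REGISTRY
```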