sibi-flux 2025.12.0 (sibi_flux-2025.12.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +44 -0
- sibi_flux/__init__.py +49 -0
- sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux/artifacts/base.py +166 -0
- sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux/conf/settings.py +131 -0
- sibi_flux/core/__init__.py +5 -0
- sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux/datacube/__init__.py +3 -0
- sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux/datacube/generator.py +677 -0
- sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux/dataset/__init__.py +3 -0
- sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux/df_enricher/types.py +12 -0
- sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux/logger/__init__.py +1 -0
- sibi_flux/logger/_logger.py +480 -0
- sibi_flux/mcp/__init__.py +26 -0
- sibi_flux/mcp/client.py +150 -0
- sibi_flux/mcp/router.py +126 -0
- sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux/pipelines/base.py +218 -0
- sibi_flux/py.typed +0 -0
- sibi_flux/readers/__init__.py +3 -0
- sibi_flux/readers/base.py +82 -0
- sibi_flux/readers/parquet.py +106 -0
- sibi_flux/utils/__init__.py +53 -0
- sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux/utils/common.py +7 -0
- sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux/utils/file_utils.py +48 -0
- sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux/utils/retry.py +46 -0
- sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux/utils/storage/factory.py +33 -0
- sibi_flux-2025.12.0.dist-info/METADATA +283 -0
- sibi_flux-2025.12.0.dist-info/RECORD +110 -0
- sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -0,0 +1,134 @@
from __future__ import annotations

from typing import Any, Tuple, Dict, Optional

import dask.dataframe as dd
import pandas as pd

from sibi_flux.core import ManagedResource
from sibi_flux.df_helper.core import ParamsConfig, QueryConfig
from ._db_connection import SqlAlchemyConnectionConfig
from ._io_dask import SQLAlchemyDask


class SqlAlchemyLoadFromDb(ManagedResource):
    """
    Orchestrates loading data from a database using SQLAlchemy into a Dask DataFrame.
    """

    logger_extra: Dict[str, Any] = {"sibi_flux_component": __name__}

    def __init__(
        self,
        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
        plugin_query: Optional[QueryConfig] = None,
        plugin_params: Optional[ParamsConfig] = None,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.db_connection = plugin_sqlalchemy

        # Access properties safely (triggers lazy load if needed)
        self.model = self.db_connection.model
        self.engine = self.db_connection.engine

        self.query_config = plugin_query
        self.params_config = plugin_params

        # Safe extraction of tuning parameters
        df_params = self.params_config.df_params if self.params_config else None

        # 1. Chunk Size (Default: 50k)
        # Priority: kwargs > df_params > default
        self.chunk_size = int(
            kwargs.get("chunk_size", self._safe_get(df_params, "chunk_size", 50_000))
        )

        # 2. Index Column (Critical for Range Pagination)
        # Priority: kwargs > df_params > None
        self.db_index = kwargs.get(
            "db_index", self._safe_get(df_params, "db_index", None)
        )

        # 3. Safety Limit
        # Priority: kwargs > df_params > None
        self.limit = kwargs.get("limit", self._safe_get(df_params, "limit", None))
        if self.limit is not None:
            self.limit = int(self.limit)

        # 4. Engine Options
        self.execution_options = kwargs.get("execution_options", {})

        self.total_records = -1

    @staticmethod
    def _safe_get(obj: Any, key: str, default: Any) -> Any:
        """Helper to extract values from a dict, a Pydantic model, or None safely."""
        if obj is None:
            return default
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    def build_and_load(self) -> Tuple[int, dd.DataFrame]:
        try:
            # Determine the pagination strategy automatically:
            # if we have an index column, try 'range' (fast); otherwise 'offset' (safe).
            pagination_mode = "range" if self.db_index else "offset"

            # Extract filters safely
            filters = self.params_config.filters if self.params_config else {}

            # Use a context manager to ensure SQLAlchemyDask resources are cleaned up
            with SQLAlchemyDask(
                model=self.model,
                filters=filters,
                engine=self.engine,
                chunk_size=self.chunk_size,
                pagination=pagination_mode,
                db_index=self.db_index,
                limit=self.limit,
                execution_options=self.execution_options,
                logger=self.logger,
                verbose=self.verbose,
                debug=self.debug,
            ) as loader:

                self.logger.debug(
                    f"SQLAlchemyDask initialized for {self.model.__name__} "
                    f"(strategy={pagination_mode}, chunk={self.chunk_size})",
                    extra=self.logger_extra,
                )

                self.total_records, dask_df = loader.read_frame()

                return self.total_records, dask_df

        except Exception as e:
            self.total_records = -1
            self.logger.error(
                f"{self.model.__name__} failed to load: {e}",
                exc_info=True,
                extra=self.logger_extra,
            )
            # Return an empty dataframe structure on failure to prevent crashes downstream
            try:
                # Try to inspect columns from the model to return an empty DF with the correct schema
                columns = [c.name for c in self.model.__table__.columns]
            except Exception:
                columns = []

            return -1, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)

    def _cleanup(self) -> None:
        """
        Clean up instance references to prevent memory leaks.
        Note: the engine is owned by DfHelper/ConnectionConfig, not by this class.
        """
        try:
            self.logger.debug(f"Cleaning up {self.__class__.__name__} refs")
            self.db_connection = None
            self.engine = None
            self.model = None
        except Exception as e:
            self.logger.warning(f"Error during cleanup: {e}", extra=self.logger_extra)
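A minimal usage sketch of the loader defined in the hunk above, based only on the signatures it shows. This is editorial, not part of the package; the connection object's construction is not covered by this hunk and is left as a placeholder.

# Editorial sketch: driving SqlAlchemyLoadFromDb per the constructor and build_and_load() above.
from sibi_flux.df_helper.backends.sqlalchemy._load_from_db import SqlAlchemyLoadFromDb

connection_config = ...  # an already-built SqlAlchemyConnectionConfig (placeholder; construction not shown in this hunk)

loader = SqlAlchemyLoadFromDb(
    plugin_sqlalchemy=connection_config,
    chunk_size=25_000,  # kwargs override df_params, which override the 50_000 default
    db_index="id",      # an index column switches pagination from 'offset' to the faster 'range'
)
total_records, dask_df = loader.build_and_load()  # on failure returns (-1, empty Dask frame)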
sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py
@@ -0,0 +1,239 @@
from __future__ import annotations

import hashlib
import sys
import types
import threading
from typing import Dict, Optional, Tuple

from sqlalchemy import MetaData, Table
from sqlalchemy.engine import Engine
from sqlalchemy.orm import DeclarativeBase


class Base(DeclarativeBase):
    """Shared declarative base for all ORM models."""

    pass


# Default module label for generated classes
apps_label = "datacubes.models"


class ModelRegistry:
    """
    Thread-safe registry that reflects tables once per (engine, schema) and
    returns a single mapped class per (engine, schema, table).
    """

    def __init__(self) -> None:
        self._metadata_cache: Dict[Tuple[str, Optional[str]], MetaData] = {}
        self._model_cache: Dict[Tuple[str, Optional[str], str], type] = {}
        self._lock = threading.RLock()
        self._md_locks: Dict[Tuple[str, Optional[str]], threading.Lock] = {}

    # ---------- key helpers ----------
    @staticmethod
    def _engine_key(engine: Engine) -> str:
        # Use the URL string as the key. Note: this includes passwords if not masked.
        # For internal cache keys, this is fine.
        return str(engine.url)

    @staticmethod
    def _qualified_key(schema: Optional[str], table: str) -> str:
        return f"{schema}.{table}" if schema else table

    @staticmethod
    def _split_schema_and_table(name: str) -> Tuple[Optional[str], str]:
        if "." in name:
            s, t = name.split(".", 1)
            return (s or None), t
        return None, name

    # ---------- class name helpers ----------
    @staticmethod
    def _normalize_class_name(table_name: str) -> str:
        return "".join(part.capitalize() for part in table_name.split("_"))

    @staticmethod
    def _short_hash(*parts: str, length: int = 8) -> str:
        h = hashlib.sha1("|".join(parts).encode("utf-8")).hexdigest()
        return h[:length]

    def _is_class_name_taken(self, class_name: str, module_label: str) -> bool:
        # Check Python's module system directly to see if the name is taken
        if module_label in sys.modules:
            return hasattr(sys.modules[module_label], class_name)
        return False

    def _find_existing_model_for_table(self, tbl: Table) -> Optional[type]:
        # Scan SQLAlchemy's global mapper registry
        for mapper in list(Base.registry.mappers):
            try:
                mapped_cls = mapper.class_
                mapped_tbl = getattr(mapped_cls, "__table__", None)
                if mapped_tbl is tbl:
                    return mapped_cls
                # Check for an equivalent table (same name/schema)
                if isinstance(mapped_tbl, Table):
                    if (mapped_tbl.schema == tbl.schema) and (
                        mapped_tbl.name == tbl.name
                    ):
                        return mapped_cls
            except Exception:
                continue
        return None

    # ---------- metadata helpers ----------
    def _get_or_create_metadata(self, ekey: str, schema: Optional[str]) -> MetaData:
        md_key = (ekey, schema)
        with self._lock:
            md = self._metadata_cache.get(md_key)
            if md is None:
                md = MetaData(schema=schema)
                self._metadata_cache[md_key] = md
            return md

    def _get_or_create_md_lock(
        self, md_key: Tuple[str, Optional[str]]
    ) -> threading.Lock:
        with self._lock:
            lock = self._md_locks.get(md_key)
            if lock is None:
                lock = threading.Lock()
                self._md_locks[md_key] = lock
            return lock

    def _register_as_module_attribute(
        self, cls: type, module_name: str, class_name: str
    ) -> None:
        """
        Injects the class into sys.modules so it can be imported/pickled.
        Ensures parent packages exist (e.g., 'datacubes' for 'datacubes.models').
        """
        if module_name not in sys.modules:
            # Create the module
            mod = types.ModuleType(module_name)
            sys.modules[module_name] = mod

            # Ensure parent packages exist
            if "." in module_name:
                parent_name, child_name = module_name.rsplit(".", 1)
                if parent_name not in sys.modules:
                    sys.modules[parent_name] = types.ModuleType(parent_name)

                # Link child to parent so `import parent; parent.child` works
                setattr(sys.modules[parent_name], child_name, mod)

        # Register the class on the module
        setattr(sys.modules[module_name], class_name, cls)

    # def _register_as_module_attribute(self, cls: type, module_name: str, class_name: str) -> None:
    #     """
    #     Injects the class into sys.modules so it can be imported/pickled.
    #     """
    #     if module_name not in sys.modules:
    #         # Create a dummy module on the fly
    #         sys.modules[module_name] = types.ModuleType(module_name)
    #
    #     # Register the class on the module
    #     setattr(sys.modules[module_name], class_name, cls)

    # ---------- public API ----------
    def get_model(
        self,
        engine: Engine,
        table_name: str,
        *,
        refresh: bool = False,
        schema: Optional[str] = None,
        module_label: Optional[str] = None,
        prefer_stable_names: bool = True,
    ) -> type:
        s2, tname = self._split_schema_and_table(table_name)
        schema = schema if schema is not None else s2
        ekey = self._engine_key(engine)
        model_key = (ekey, schema, tname)
        md_key = (ekey, schema)
        module_label = module_label or apps_label

        if refresh:
            with self._lock:
                self._model_cache.pop(model_key, None)
                # Note: we don't drop metadata easily, as it might be used by other models

        # fast path: already cached model
        with self._lock:
            m = self._model_cache.get(model_key)
            if m is not None:
                return m

        # ensure metadata and reflection are serialized per (engine, schema)
        md = self._get_or_create_metadata(ekey, schema)
        md_lock = self._get_or_create_md_lock(md_key)
        qname = self._qualified_key(schema, tname)

        # Reflection logic
        tbl = md.tables.get(qname)
        if tbl is None:
            with md_lock:
                # double-checked locking
                tbl = md.tables.get(qname)
                if tbl is None:
                    # Reflect only this table
                    md.reflect(bind=engine, only=[tname], schema=schema)
                    tbl = md.tables.get(qname)

        if tbl is None:
            raise ValueError(f"Table '{qname}' does not exist in the database.")

        # If a mapped model for this Table already exists (anywhere), reuse it
        reused = self._find_existing_model_for_table(tbl)
        if reused is not None:
            with self._lock:
                self._model_cache[model_key] = reused
            return reused

        # pick class name
        base_name = self._normalize_class_name(tname)
        final_name = base_name

        # Conflict resolution
        if self._is_class_name_taken(base_name, module_label):
            suffix = self._short_hash(ekey, schema or "", tname)
            final_name = f"{base_name}_{suffix}"
        elif not prefer_stable_names:
            suffix = self._short_hash(ekey, schema or "", tname)
            final_name = f"{base_name}_{suffix}"

        # build the model
        attrs = {
            "__tablename__": tbl.name,
            "__table__": tbl,
            "__module__": module_label,
        }

        # Create the dynamic class
        model_cls = type(final_name, (Base,), attrs)

        # CRITICAL FIX: make it pickleable by registering it as a module attribute
        self._register_as_module_attribute(model_cls, module_label, final_name)

        with self._lock:
            self._model_cache[model_key] = model_cls
        return model_cls

    def clear(self) -> None:
        with self._lock:
            self._metadata_cache.clear()
            self._model_cache.clear()
            self._md_locks.clear()


# Singleton
_global_registry = ModelRegistry()


def get_global_registry() -> ModelRegistry:
    return _global_registry
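For orientation, a short sketch of how the registry above is consumed, using only the public get_model signature it defines. The engine URL and table name are illustrative assumptions, not taken from the package.

# Editorial sketch: reflect a table once and reuse the cached mapped class.
from sqlalchemy import create_engine
from sibi_flux.df_helper.backends.sqlalchemy._model_registry import get_global_registry

engine = create_engine("sqlite:///example.db")  # placeholder URL; the table must already exist
registry = get_global_registry()

Orders = registry.get_model(engine, "orders")           # reflects and maps the table on first call
assert Orders is registry.get_model(engine, "orders")   # later calls return the cached class per (engine, schema, table)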
sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py
@@ -0,0 +1,42 @@
import keyword
import re
from typing import Type
from sqlalchemy.engine import Engine

from ._model_registry import get_global_registry, apps_label


class SqlAlchemyModelBuilder:
    """
    Builds a single SQLAlchemy ORM model from a specific database table.
    Delegates to the process-wide ModelRegistry for thread-safety and caching.
    """

    def __init__(self, engine: Engine, table_name: str):
        self.engine = engine
        self.table_name = table_name

    def build_model(self) -> Type:
        """Reflects the table and returns the mapped ORM class."""
        registry = get_global_registry()

        # The registry handles locking internally, so we don't need a lock here
        return registry.get_model(
            engine=self.engine,
            table_name=self.table_name,
            module_label=apps_label,
            prefer_stable_names=True,
        )

    @staticmethod
    def _normalize_class_name(table_name: str) -> str:
        return "".join(word.capitalize() for word in table_name.split("_"))

    @staticmethod
    def _normalize_column_name(column_name: str) -> str:
        # Useful helper if you ever need to generate column aliases
        sane_name = re.sub(r"\W", "_", column_name)
        sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
        if keyword.iskeyword(sane_name):
            return f"{sane_name}_field"
        return sane_name
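The builder above is a thin wrapper over the global registry; a one-line usage sketch (editorial, with a placeholder engine URL and table name):

# Editorial sketch: build a mapped class from a table name via the wrapper above.
from sqlalchemy import create_engine
from sibi_flux.df_helper.backends.sqlalchemy._sql_model_builder import SqlAlchemyModelBuilder

engine = create_engine("sqlite:///example.db")  # placeholder; the table must exist in this database
OrderDetail = SqlAlchemyModelBuilder(engine, "order_detail").build_model()
# The class name is normalized from the table name ("order_detail" -> "OrderDetail") and registered
# under the "datacubes.models" module label so the class stays importable and pickleable.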
sibi_flux/df_helper/backends/utils.py
@@ -0,0 +1,32 @@
from __future__ import annotations

from datetime import date
from typing import Any, Optional

import dask.dataframe as dd

from sibi_flux.dask_cluster.core import safe_compute, safe_persist


def is_dask_df(x: Any) -> bool:
    return isinstance(x, dd.DataFrame)


def maybe_persist(df: Any, persist: bool):
    return safe_persist(df) if persist and is_dask_df(df) else df


def maybe_compute(df: Any, as_pandas: bool):
    return safe_compute(df) if as_pandas and is_dask_df(df) else df


def parse_iso_date(value: Optional[str], field_name: str) -> Optional[date]:
    if value is None:
        return None
    if not isinstance(value, str):
        raise TypeError(f"{field_name} must be a string in YYYY-MM-DD format.")
    try:
        return date.fromisoformat(value)
    except ValueError as e:
        raise ValueError(f"{field_name} must be in YYYY-MM-DD format.") from e
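A small, editorial usage sketch of the helpers above; the sample frame is illustrative, and safe_persist/safe_compute are the package's own Dask wrappers imported by the module.

# Editorial sketch: the helpers are no-ops for non-Dask inputs and for flags left off.
import pandas as pd
import dask.dataframe as dd
from sibi_flux.df_helper.backends.utils import maybe_compute, maybe_persist, parse_iso_date

ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)
ddf = maybe_persist(ddf, persist=True)     # persists only when the input is a Dask frame
pdf = maybe_compute(ddf, as_pandas=True)   # collapses to pandas only when asked and when input is Dask

start = parse_iso_date("2025-01-31", "start_date")  # -> datetime.date(2025, 1, 31)
none_ = parse_iso_date(None, "end_date")            # -> None; malformed strings raise ValueError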
sibi_flux/df_helper/core/__init__.py
@@ -0,0 +1,15 @@
from __future__ import annotations

from ._defaults import sqlalchemy_field_conversion_map_dask, normalize_sqlalchemy_type
from ._filter_handler import FilterHandler
from ._params_config import ParamsConfig, DataFrameParams
from ._query_config import QueryConfig

__all__ = [
    "ParamsConfig",
    "QueryConfig",
    "sqlalchemy_field_conversion_map_dask",
    "normalize_sqlalchemy_type",
    "FilterHandler",
    "DataFrameParams",
]
sibi_flux/df_helper/core/_defaults.py
@@ -0,0 +1,104 @@
# Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
#
import json
from typing import Dict, Callable

import pandas as pd
from sqlalchemy import (
    String,
    Text,
    Integer,
    BigInteger,
    SmallInteger,
    Float,
    Boolean,
    DateTime,
    Date,
    Time,
    JSON,
    Numeric,
    UUID,
)
from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT

# This is the defaults configuration file for the df_helper module.

# conversion_map is a dictionary that maps field types to their corresponding data type conversion functions.
# Each entry in the dictionary pairs a field type (as a string) with a callable that performs the conversion.
# This mapping is used to convert the values in a DataFrame to the appropriate data types based on the
# database field type.

sqlalchemy_field_conversion_map_dask: Dict[str, Callable] = {
    String.__name__: lambda x: x.astype(str).fillna(""),
    Text.__name__: lambda x: x.fillna("").astype(str),
    Integer.__name__: lambda x: x.fillna(0).astype(int),
    BigInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
    SmallInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
    Float.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
    Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
    Boolean.__name__: lambda x: x.astype(bool),
    DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(
        lambda x: x.dt.date, meta=("date", "object")
    ),
    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(
        lambda x: x.dt.time, meta=("time", "object")
    ),
    JSON.__name__: lambda x: x.map_partitions(
        lambda s: s.apply(json.loads), meta=("json", "object")
    ),
    UUID.__name__: lambda x: x.astype(str),
}


# Conversion map with normalized SQLAlchemy field types
# sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
#     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
#     "Text": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("text", "string")),
#     "Integer": lambda x: pd.to_numeric(x, errors="coerce"),
#     "SmallInteger": lambda x: pd.to_numeric(x, errors="coerce"),
#     "BigInteger": lambda x: pd.to_numeric(x, errors="coerce"),
#     "Float": lambda x: pd.to_numeric(x, errors="coerce"),
#     "Numeric": lambda x: pd.to_numeric(x, errors="coerce"),
#     "Boolean": lambda x: x.map_partitions(lambda s: s.fillna(False).astype(bool), meta=("boolean", "bool")),
#     "DateTime": lambda x: pd.to_datetime(x, errors="coerce"),
#     "Date": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.date, meta=("date", "object")),
#     "Time": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.time, meta=("time", "object")),
#     "JSON": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
# }


def normalize_sqlalchemy_type(field_type):
    """
    Normalize SQLAlchemy field types to generic type names.
    Handles dialect-specific types (e.g., MySQL).
    """
    # Map of generic SQLAlchemy types
    type_mapping = {
        String: "String",
        Text: "Text",
        Integer: "Integer",
        SmallInteger: "SmallInteger",
        BigInteger: "BigInteger",
        Float: "Float",
        Numeric: "Numeric",
        Boolean: "Boolean",
        DateTime: "DateTime",
        Date: "Date",
        Time: "Time",
        JSON: "JSON",
    }

    # Dialect-specific types
    dialect_mapping = {
        TINYINT: "SmallInteger",
        MEDIUMTEXT: "Text",
    }

    # Check if the field matches a generic or dialect-specific type
    for sql_type, name in {**type_mapping, **dialect_mapping}.items():
        if isinstance(field_type, sql_type):
            return name

    # Fallback to raw class name
    return field_type.__class__.__name__
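A brief, editorial sketch pairing normalize_sqlalchemy_type with the conversion map above; the throwaway table stands in for a reflected model and is not taken from the package.

# Editorial sketch: look up a Dask converter for each column's normalized type name.
from sqlalchemy import Column, DateTime, Integer, MetaData, String, Table
from sibi_flux.df_helper.core import normalize_sqlalchemy_type, sqlalchemy_field_conversion_map_dask

# Illustrative table definition only (a reflected model's __table__ would be used in practice).
table = Table(
    "events",
    MetaData(),
    Column("id", Integer),
    Column("name", String(50)),
    Column("created_at", DateTime),
)

for column in table.columns:
    type_name = normalize_sqlalchemy_type(column.type)            # e.g. Integer -> "Integer", DateTime -> "DateTime"
    converter = sqlalchemy_field_conversion_map_dask.get(type_name)
    print(column.name, type_name, converter is not None)          # every column here has a registered converter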