sibi-flux 2025.12.0 (sibi_flux-2025.12.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -0,0 +1,134 @@
+ from __future__ import annotations
+
+ from typing import Any, Tuple, Dict, Optional
+
+ import dask.dataframe as dd
+ import pandas as pd
+
+ from sibi_flux.core import ManagedResource
+ from sibi_flux.df_helper.core import ParamsConfig, QueryConfig
+ from ._db_connection import SqlAlchemyConnectionConfig
+ from ._io_dask import SQLAlchemyDask
+
+
+ class SqlAlchemyLoadFromDb(ManagedResource):
+     """
+     Orchestrates loading data from a database using SQLAlchemy into a Dask DataFrame.
+     """
+
+     logger_extra: Dict[str, Any] = {"sibi_flux_component": __name__}
+
+     def __init__(
+         self,
+         plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+         plugin_query: Optional[QueryConfig] = None,
+         plugin_params: Optional[ParamsConfig] = None,
+         **kwargs: Any,
+     ):
+         super().__init__(**kwargs)
+         self.db_connection = plugin_sqlalchemy
+
+         # Access properties safely (triggers lazy load if needed)
+         self.model = self.db_connection.model
+         self.engine = self.db_connection.engine
+
+         self.query_config = plugin_query
+         self.params_config = plugin_params
+
+         # Safe extraction of tuning parameters
+         df_params = self.params_config.df_params if self.params_config else None
+
+         # 1. Chunk size (default: 50k)
+         # Priority: kwargs > df_params > default
+         self.chunk_size = int(
+             kwargs.get("chunk_size", self._safe_get(df_params, "chunk_size", 50_000))
+         )
+
+         # 2. Index column (critical for range pagination)
+         # Priority: kwargs > df_params > None
+         self.db_index = kwargs.get(
+             "db_index", self._safe_get(df_params, "db_index", None)
+         )
+
+         # 3. Safety limit
+         # Priority: kwargs > df_params > None
+         self.limit = kwargs.get("limit", self._safe_get(df_params, "limit", None))
+         if self.limit is not None:
+             self.limit = int(self.limit)
+
+         # 4. Engine options
+         self.execution_options = kwargs.get("execution_options", {})
+
+         self.total_records = -1
+
+     @staticmethod
+     def _safe_get(obj: Any, key: str, default: Any) -> Any:
+         """Helper to extract values safely from a dict, a Pydantic model, or None."""
+         if obj is None:
+             return default
+         if isinstance(obj, dict):
+             return obj.get(key, default)
+         return getattr(obj, key, default)
+
+     def build_and_load(self) -> Tuple[int, dd.DataFrame]:
+         try:
+             # Determine the pagination strategy automatically:
+             # with an index column, use 'range' (fast); otherwise 'offset' (safe).
+             pagination_mode = "range" if self.db_index else "offset"
+
+             # Extract filters safely
+             filters = self.params_config.filters if self.params_config else {}
+
+             # Use a context manager so SQLAlchemyDask resources are cleaned up
+             with SQLAlchemyDask(
+                 model=self.model,
+                 filters=filters,
+                 engine=self.engine,
+                 chunk_size=self.chunk_size,
+                 pagination=pagination_mode,
+                 db_index=self.db_index,
+                 limit=self.limit,
+                 execution_options=self.execution_options,
+                 logger=self.logger,
+                 verbose=self.verbose,
+                 debug=self.debug,
+             ) as loader:
+
+                 self.logger.debug(
+                     f"SQLAlchemyDask initialized for {self.model.__name__} "
+                     f"(strategy={pagination_mode}, chunk={self.chunk_size})",
+                     extra=self.logger_extra,
+                 )
+
+                 self.total_records, dask_df = loader.read_frame()
+
+                 return self.total_records, dask_df
+
+         except Exception as e:
+             self.total_records = -1
+             self.logger.error(
+                 f"{self.model.__name__} failed to load: {e}",
+                 exc_info=True,
+                 extra=self.logger_extra,
+             )
+             # Return an empty dataframe on failure to prevent crashes downstream
+             try:
+                 # Inspect the model's columns to build an empty DF with the correct schema
+                 columns = [c.name for c in self.model.__table__.columns]
+             except Exception:
+                 columns = []
+
+             return -1, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+
+     def _cleanup(self) -> None:
+         """
+         Clean up instance references to prevent memory leaks.
+         Note: the engine is owned by DfHelper/ConnectionConfig, not by this class.
+         """
+         try:
+             self.logger.debug(f"Cleaning up {self.__class__.__name__} refs")
+             self.db_connection = None
+             self.engine = None
+             self.model = None
+         except Exception as e:
+             self.logger.warning(f"Error during cleanup: {e}", extra=self.logger_extra)
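A minimal usage sketch for the loader above. Only the loader's own API (`build_and_load`, the `chunk_size`/`db_index`/`limit` kwargs) comes from this diff; the `SqlAlchemyConnectionConfig` constructor arguments and its import path are assumptions for illustration.

    # Hypothetical sketch: connection-config arguments are assumed, not confirmed.
    from sibi_flux.df_helper.backends.sqlalchemy import SqlAlchemyConnectionConfig

    conn = SqlAlchemyConnectionConfig(
        connection_url="postgresql+psycopg2://user:pass@host/db",  # placeholder
        table="orders",                                            # placeholder
    )

    # kwargs take priority over df_params for chunk_size/db_index/limit.
    loader = SqlAlchemyLoadFromDb(conn, chunk_size=10_000, db_index="id")
    total, ddf = loader.build_and_load()  # returns (-1, empty frame) on failure
    print(total, ddf.npartitions)

Passing `db_index` switches the loader to range pagination; without it, it falls back to the slower but always-correct offset strategy.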
sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py
@@ -0,0 +1,239 @@
+ from __future__ import annotations
+
+ import hashlib
+ import sys
+ import types
+ import threading
+ from typing import Dict, Optional, Tuple
+
+ from sqlalchemy import MetaData, Table
+ from sqlalchemy.engine import Engine
+ from sqlalchemy.orm import DeclarativeBase
+
+
+ class Base(DeclarativeBase):
+     """Shared declarative base for all ORM models."""
+
+     pass
+
+
+ # Default module label for generated classes
+ apps_label = "datacubes.models"
+
+
+ class ModelRegistry:
+     """
+     Thread-safe registry that reflects tables once per (engine, schema) and
+     returns a single mapped class per (engine, schema, table).
+     """
+
+     def __init__(self) -> None:
+         self._metadata_cache: Dict[Tuple[str, Optional[str]], MetaData] = {}
+         self._model_cache: Dict[Tuple[str, Optional[str], str], type] = {}
+         self._lock = threading.RLock()
+         self._md_locks: Dict[Tuple[str, Optional[str]], threading.Lock] = {}
+
+     # ---------- key helpers ----------
+     @staticmethod
+     def _engine_key(engine: Engine) -> str:
+         # Use the URL string as the key. Note: this includes passwords if not masked.
+         # For internal cache keys, this is fine.
+         return str(engine.url)
+
+     @staticmethod
+     def _qualified_key(schema: Optional[str], table: str) -> str:
+         return f"{schema}.{table}" if schema else table
+
+     @staticmethod
+     def _split_schema_and_table(name: str) -> Tuple[Optional[str], str]:
+         if "." in name:
+             s, t = name.split(".", 1)
+             return (s or None), t
+         return None, name
+
+     # ---------- class name helpers ----------
+     @staticmethod
+     def _normalize_class_name(table_name: str) -> str:
+         return "".join(part.capitalize() for part in table_name.split("_"))
+
+     @staticmethod
+     def _short_hash(*parts: str, length: int = 8) -> str:
+         h = hashlib.sha1("|".join(parts).encode("utf-8")).hexdigest()
+         return h[:length]
+
+     def _is_class_name_taken(self, class_name: str, module_label: str) -> bool:
+         # Check Python's module system directly to see if the name is taken
+         if module_label in sys.modules:
+             return hasattr(sys.modules[module_label], class_name)
+         return False
+
+     def _find_existing_model_for_table(self, tbl: Table) -> Optional[type]:
+         # Scan SQLAlchemy's global mapper registry
+         for mapper in list(Base.registry.mappers):
+             try:
+                 mapped_cls = mapper.class_
+                 mapped_tbl = getattr(mapped_cls, "__table__", None)
+                 if mapped_tbl is tbl:
+                     return mapped_cls
+                 # Check for an equivalent table (same name/schema)
+                 if isinstance(mapped_tbl, Table):
+                     if (mapped_tbl.schema == tbl.schema) and (
+                         mapped_tbl.name == tbl.name
+                     ):
+                         return mapped_cls
+             except Exception:
+                 continue
+         return None
+
+     # ---------- metadata helpers ----------
+     def _get_or_create_metadata(self, ekey: str, schema: Optional[str]) -> MetaData:
+         md_key = (ekey, schema)
+         with self._lock:
+             md = self._metadata_cache.get(md_key)
+             if md is None:
+                 md = MetaData(schema=schema)
+                 self._metadata_cache[md_key] = md
+             return md
+
+     def _get_or_create_md_lock(
+         self, md_key: Tuple[str, Optional[str]]
+     ) -> threading.Lock:
+         with self._lock:
+             lock = self._md_locks.get(md_key)
+             if lock is None:
+                 lock = threading.Lock()
+                 self._md_locks[md_key] = lock
+             return lock
+
+     def _register_as_module_attribute(
+         self, cls: type, module_name: str, class_name: str
+     ) -> None:
+         """
+         Injects the class into sys.modules so it can be imported/pickled.
+         Ensures parent packages exist (e.g., 'datacubes' for 'datacubes.models').
+         """
+         if module_name not in sys.modules:
+             # Create the module
+             mod = types.ModuleType(module_name)
+             sys.modules[module_name] = mod
+
+             # Ensure parent packages exist
+             if "." in module_name:
+                 parent_name, child_name = module_name.rsplit(".", 1)
+                 if parent_name not in sys.modules:
+                     sys.modules[parent_name] = types.ModuleType(parent_name)
+
+                 # Link child to parent so `import parent; parent.child` works
+                 setattr(sys.modules[parent_name], child_name, mod)
+
+         # Register the class on the module
+         setattr(sys.modules[module_name], class_name, cls)
+
+     # def _register_as_module_attribute(self, cls: type, module_name: str, class_name: str) -> None:
+     #     """
+     #     Injects the class into sys.modules so it can be imported/pickled.
+     #     """
+     #     if module_name not in sys.modules:
+     #         # Create a dummy module on the fly
+     #         sys.modules[module_name] = types.ModuleType(module_name)
+     #
+     #     # Register the class on the module
+     #     setattr(sys.modules[module_name], class_name, cls)
+
+     # ---------- public API ----------
+     def get_model(
+         self,
+         engine: Engine,
+         table_name: str,
+         *,
+         refresh: bool = False,
+         schema: Optional[str] = None,
+         module_label: Optional[str] = None,
+         prefer_stable_names: bool = True,
+     ) -> type:
+         s2, tname = self._split_schema_and_table(table_name)
+         schema = schema if schema is not None else s2
+         ekey = self._engine_key(engine)
+         model_key = (ekey, schema, tname)
+         md_key = (ekey, schema)
+         module_label = module_label or apps_label
+
+         if refresh:
+             with self._lock:
+                 self._model_cache.pop(model_key, None)
+                 # Note: metadata is not dropped here, since other models may use it
+
+         # fast path: already cached model
+         with self._lock:
+             m = self._model_cache.get(model_key)
+             if m is not None:
+                 return m
+
+         # ensure metadata and reflection are serialized per (engine, schema)
+         md = self._get_or_create_metadata(ekey, schema)
+         md_lock = self._get_or_create_md_lock(md_key)
+         qname = self._qualified_key(schema, tname)
+
+         # Reflection logic
+         tbl = md.tables.get(qname)
+         if tbl is None:
+             with md_lock:
+                 # double-checked locking
+                 tbl = md.tables.get(qname)
+                 if tbl is None:
+                     # Reflect only this table
+                     md.reflect(bind=engine, only=[tname], schema=schema)
+                     tbl = md.tables.get(qname)
+
+         if tbl is None:
+             raise ValueError(f"Table '{qname}' does not exist in the database.")
+
+         # If a mapped model for this Table already exists (anywhere), reuse it
+         reused = self._find_existing_model_for_table(tbl)
+         if reused is not None:
+             with self._lock:
+                 self._model_cache[model_key] = reused
+                 return reused
+
+         # pick class name
+         base_name = self._normalize_class_name(tname)
+         final_name = base_name
+
+         # Conflict resolution
+         if self._is_class_name_taken(base_name, module_label):
+             suffix = self._short_hash(ekey, schema or "", tname)
+             final_name = f"{base_name}_{suffix}"
+         elif not prefer_stable_names:
+             suffix = self._short_hash(ekey, schema or "", tname)
+             final_name = f"{base_name}_{suffix}"
+
+         # build the model
+         attrs = {
+             "__tablename__": tbl.name,
+             "__table__": tbl,
+             "__module__": module_label,
+         }
+
+         # Create the dynamic class
+         model_cls = type(final_name, (Base,), attrs)
+
+         # CRITICAL FIX: Make it pickleable by registering it
+         self._register_as_module_attribute(model_cls, module_label, final_name)
+
+         with self._lock:
+             self._model_cache[model_key] = model_cls
+             return model_cls
+
+     def clear(self) -> None:
+         with self._lock:
+             self._metadata_cache.clear()
+             self._model_cache.clear()
+             self._md_locks.clear()
+
+
+ # Singleton
+ _global_registry = ModelRegistry()
+
+
+ def get_global_registry() -> ModelRegistry:
+     return _global_registry
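A short, self-contained sketch of the registry's public API as defined above. The in-memory SQLite engine and table are illustrative (SQLAlchemy's default SingletonThreadPool keeps the in-memory database alive across connections within one thread); note that a reflected table needs a primary key to be ORM-mappable this way.

    from sqlalchemy import create_engine, text

    engine = create_engine("sqlite:///:memory:")  # single-threaded demo engine
    with engine.begin() as conn:
        conn.execute(text("CREATE TABLE user_account (id INTEGER PRIMARY KEY, name TEXT)"))

    registry = get_global_registry()
    UserAccount = registry.get_model(engine, "user_account")

    # The second call hits the model cache; the table is reflected only once.
    assert registry.get_model(engine, "user_account") is UserAccount
    print(UserAccount.__name__, UserAccount.__module__)  # UserAccount datacubes.models

Because the class is also injected into the synthetic `datacubes.models` module, instances survive pickling, which is what Dask workers need.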
sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py
@@ -0,0 +1,42 @@
+ import keyword
+ import re
+ from typing import Type
+ from sqlalchemy.engine import Engine
+
+ from ._model_registry import get_global_registry, apps_label
+
+
+ class SqlAlchemyModelBuilder:
+     """
+     Builds a single SQLAlchemy ORM model from a specific database table.
+     Delegates to the process-wide ModelRegistry for thread-safety and caching.
+     """
+
+     def __init__(self, engine: Engine, table_name: str):
+         self.engine = engine
+         self.table_name = table_name
+
+     def build_model(self) -> Type:
+         """Reflects the table and returns the mapped ORM class."""
+         registry = get_global_registry()
+
+         # The registry handles locking internally, so we don't need a lock here
+         return registry.get_model(
+             engine=self.engine,
+             table_name=self.table_name,
+             module_label=apps_label,
+             prefer_stable_names=True,
+         )
+
+     @staticmethod
+     def _normalize_class_name(table_name: str) -> str:
+         return "".join(word.capitalize() for word in table_name.split("_"))
+
+     @staticmethod
+     def _normalize_column_name(column_name: str) -> str:
+         # Useful helper if you ever need to generate column aliases
+         sane_name = re.sub(r"\W", "_", column_name)
+         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
+         if keyword.iskeyword(sane_name):
+             return f"{sane_name}_field"
+         return sane_name
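For illustration, the builder's call pattern (a thin facade over the global registry) plus the observable behavior of the `_normalize_column_name` helper; `engine` is any SQLAlchemy Engine, as in the registry sketch above.

    builder = SqlAlchemyModelBuilder(engine, "user_account")
    Model = builder.build_model()  # same cached class the registry returns

    # Static helper behavior:
    print(SqlAlchemyModelBuilder._normalize_column_name("first name"))  # first_name
    print(SqlAlchemyModelBuilder._normalize_column_name("2fa"))         # _2fa
    print(SqlAlchemyModelBuilder._normalize_column_name("class"))       # class_field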
sibi_flux/df_helper/backends/utils.py
@@ -0,0 +1,32 @@
+ from __future__ import annotations
+
+ from datetime import date
+ from typing import Any, Optional
+
+ import dask.dataframe as dd
+
+
+ from sibi_flux.dask_cluster.core import safe_compute, safe_persist
+
+
+ def is_dask_df(x: Any) -> bool:
+     return isinstance(x, dd.DataFrame)
+
+
+ def maybe_persist(df: Any, persist: bool):
+     return safe_persist(df) if persist and is_dask_df(df) else df
+
+
+ def maybe_compute(df: Any, as_pandas: bool):
+     return safe_compute(df) if as_pandas and is_dask_df(df) else df
+
+
+ def parse_iso_date(value: Optional[str], field_name: str) -> Optional[date]:
+     if value is None:
+         return None
+     if not isinstance(value, str):
+         raise TypeError(f"{field_name} must be a string in YYYY-MM-DD format.")
+     try:
+         return date.fromisoformat(value)
+     except ValueError as e:
+         raise ValueError(f"{field_name} must be in YYYY-MM-DD format.") from e
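A few examples of how these helpers behave. `parse_iso_date` is self-contained; `maybe_persist`/`maybe_compute` are pass-throughs for non-Dask inputs, which the pandas lines below rely on.

    import pandas as pd

    print(parse_iso_date(None, "start_date"))          # None
    print(parse_iso_date("2025-12-01", "start_date"))  # 2025-12-01 (a datetime.date)
    # parse_iso_date("12/01/2025", "start_date")       # would raise ValueError

    pdf = pd.DataFrame({"a": [1, 2]})
    assert maybe_compute(pdf, as_pandas=True) is pdf   # not a Dask frame: returned as-is
    assert maybe_persist(pdf, persist=True) is pdf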
sibi_flux/df_helper/core/__init__.py
@@ -0,0 +1,15 @@
+ from __future__ import annotations
+
+ from ._defaults import sqlalchemy_field_conversion_map_dask, normalize_sqlalchemy_type
+ from ._filter_handler import FilterHandler
+ from ._params_config import ParamsConfig, DataFrameParams
+ from ._query_config import QueryConfig
+
+ __all__ = [
+     "ParamsConfig",
+     "QueryConfig",
+     "sqlalchemy_field_conversion_map_dask",
+     "normalize_sqlalchemy_type",
+     "FilterHandler",
+     "DataFrameParams",
+ ]
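These re-exports define the package-level import path used elsewhere in this diff, e.g. by `_load_from_db.py` above:

    from sibi_flux.df_helper.core import ParamsConfig, QueryConfig, FilterHandler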
sibi_flux/df_helper/core/_defaults.py
@@ -0,0 +1,104 @@
+ # Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
+ #
+ import json
+ from typing import Dict, Callable
+
+ import pandas as pd
+ from sqlalchemy import (
+     String,
+     Text,
+     Integer,
+     BigInteger,
+     SmallInteger,
+     Float,
+     Boolean,
+     DateTime,
+     Date,
+     Time,
+     JSON,
+     Numeric,
+     UUID,
+ )
+ from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT
+
+ # This is the defaults configuration file for the df_helper module.
+
+ # sqlalchemy_field_conversion_map_dask maps SQLAlchemy field type names (as
+ # strings) to callables that convert the values of a DataFrame column to the
+ # appropriate data type for that database field type.
+
+ sqlalchemy_field_conversion_map_dask: Dict[str, Callable] = {
+     # Fill before casting so that NaN does not become the literal string "nan"
+     String.__name__: lambda x: x.fillna("").astype(str),
+     Text.__name__: lambda x: x.fillna("").astype(str),
+     Integer.__name__: lambda x: x.fillna(0).astype(int),
+     BigInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+     SmallInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+     Float.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+     Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+     Boolean.__name__: lambda x: x.astype(bool),
+     DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
+     Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(
+         lambda s: s.dt.date, meta=("date", "object")
+     ),
+     Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(
+         lambda s: s.dt.time, meta=("time", "object")
+     ),
+     JSON.__name__: lambda x: x.map_partitions(
+         lambda s: s.apply(json.loads), meta=("json", "object")
+     ),
+     UUID.__name__: lambda x: x.astype(str),
+ }
+
+
+ # Conversion map with normalized SQLAlchemy field types
+ # sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+ #     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
+ #     "Text": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("text", "string")),
+ #     "Integer": lambda x: pd.to_numeric(x, errors="coerce"),
+ #     "SmallInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+ #     "BigInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+ #     "Float": lambda x: pd.to_numeric(x, errors="coerce"),
+ #     "Numeric": lambda x: pd.to_numeric(x, errors="coerce"),
+ #     "Boolean": lambda x: x.map_partitions(lambda s: s.fillna(False).astype(bool), meta=("boolean", "bool")),
+ #     "DateTime": lambda x: pd.to_datetime(x, errors="coerce"),
+ #     "Date": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.date, meta=("date", "object")),
+ #     "Time": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.time, meta=("time", "object")),
+ #     "JSON": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+ # }
+
+
+ def normalize_sqlalchemy_type(field_type):
+     """
+     Normalize SQLAlchemy field types to generic type names.
+     Handles dialect-specific types (e.g., MySQL).
+     """
+     # Map of generic SQLAlchemy types, listed most-specific first so that
+     # subclasses (e.g., Text subclasses String, BigInteger subclasses Integer)
+     # are matched before their base classes.
+     type_mapping = {
+         Text: "Text",
+         String: "String",
+         SmallInteger: "SmallInteger",
+         BigInteger: "BigInteger",
+         Integer: "Integer",
+         Float: "Float",
+         Numeric: "Numeric",
+         Boolean: "Boolean",
+         DateTime: "DateTime",
+         Date: "Date",
+         Time: "Time",
+         JSON: "JSON",
+     }
+
+     # Dialect-specific types (checked before the generic ones for the same reason)
+     dialect_mapping = {
+         TINYINT: "SmallInteger",
+         MEDIUMTEXT: "Text",
+     }
+
+     # Check dialect-specific types first, then generic types
+     for sql_type, name in {**dialect_mapping, **type_mapping}.items():
+         if isinstance(field_type, sql_type):
+             return name
+
+     # Fall back to the raw class name
+     return field_type.__class__.__name__
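A brief sketch of how the two pieces compose: normalize a reflected column's type to a generic name, then look up the matching Dask conversion callable. The `Column` here is constructed inline purely for illustration.

    from sqlalchemy import Column

    col = Column("flag", TINYINT())  # MySQL dialect-specific type
    type_name = normalize_sqlalchemy_type(col.type)   # "SmallInteger"
    converter = sqlalchemy_field_conversion_map_dask[type_name]
    # The callable would then be applied to the matching Dask series:
    # ddf["flag"] = converter(ddf["flag"])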