sibi-dst 0.3.10__tar.gz → 0.3.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/PKG-INFO +5 -2
  2. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/pyproject.toml +5 -2
  3. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/_df_helper.py +9 -2
  4. sibi_dst-0.3.12/sibi_dst/df_helper/core/__init__.py +18 -0
  5. sibi_dst-0.3.12/sibi_dst/df_helper/core/_defaults.py +133 -0
  6. sibi_dst-0.3.12/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +121 -0
  7. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/http/_http_config.py +2 -3
  8. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +4 -0
  9. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py +2 -0
  10. sibi_dst-0.3.12/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +133 -0
  11. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +2 -0
  12. sibi_dst-0.3.12/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +59 -0
  13. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/__init__.py +2 -1
  14. sibi_dst-0.3.12/sibi_dst/utils/_data_utils.py +215 -0
  15. sibi_dst-0.3.12/sibi_dst/utils/_data_wrapper.py +238 -0
  16. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_date_utils.py +130 -0
  17. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_df_utils.py +91 -0
  18. sibi_dst-0.3.10/sibi_dst/df_helper/core/__init__.py +0 -11
  19. sibi_dst-0.3.10/sibi_dst/df_helper/core/_defaults.py +0 -37
  20. sibi_dst-0.3.10/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +0 -70
  21. sibi_dst-0.3.10/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +0 -143
  22. sibi_dst-0.3.10/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +0 -83
  23. sibi_dst-0.3.10/sibi_dst/utils/_data_utils.py +0 -187
  24. sibi_dst-0.3.10/sibi_dst/utils/_data_wrapper.py +0 -556
  25. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/README.md +0 -0
  26. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/__init__.py +0 -0
  27. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/__init__.py +0 -0
  28. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  29. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/core/_params_config.py +0 -0
  30. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/core/_query_config.py +0 -0
  31. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/__init__.py +0 -0
  32. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/__init__.py +0 -0
  33. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_django_db_connection.py +0 -0
  34. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py +0 -0
  35. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_io_dask.py +0 -0
  36. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_io_dask_alt.py +0 -0
  37. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/http/__init__.py +0 -0
  38. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/__init__.py +0 -0
  39. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/__init__.py +0 -0
  40. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  41. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
  42. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_model/__init__.py +0 -0
  43. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py +0 -0
  44. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +0 -0
  45. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_airflow_manager.py +0 -0
  46. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_clickhouse_writer.py +0 -0
  47. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_credentials.py +0 -0
  48. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_file_utils.py +0 -0
  49. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_filepath_generator.py +0 -0
  50. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_log_utils.py +0 -0
  51. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_parquet_saver.py +0 -0
  52. {sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/_storage_manager.py +0 -0

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.10
-Summary:
+Version: 0.3.12
+Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
 Requires-Python: >=3.11,<4.0
@@ -9,6 +9,8 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: chardet (>=5.2.0,<6.0.0)
+Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
@@ -29,6 +31,7 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: sqlmodel (>=0.0.22,<0.0.23)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.10"
-description = ""
+version = "0.3.12"
+description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
 packages = [{include = "sibi_dst"}]
@@ -29,6 +29,9 @@ pytest = "^8.3.3"
 clickhouse-connect = "^0.8.7"
 clickhouse-driver = "^0.2.9"
 paramiko = "^3.5.0"
+chardet = "^5.2.0"
+charset-normalizer = "^3.4.0"
+uvicorn = "^0.32.1"
 
 
 [build-system]

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/_df_helper.py
@@ -42,6 +42,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
 
@@ -101,6 +102,8 @@ class DfHelper:
 
     def _load_from_sqlalchemy(self, **options):
        try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
            db_loader = SqlAlchemyLoadFromDb(
                self.plugin_sqlalchemy,
                self.plugin_query,
@@ -120,6 +123,8 @@ class DfHelper:
 
     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
        try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
            db_loader = DjangoLoadFromDb(
                self.plugin_django_connection,
                self.plugin_query,
@@ -207,6 +212,7 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
     def save_to_clickhouse(self, database, table, order_by=None, **credentials):
         click_config ={
@@ -215,13 +221,14 @@
             'order_by': order_by or 'id',
         }
         credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df
 
     def load_period(self, **kwargs):
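
A quick aside on the pattern these hunks rely on: DfHelper now injects its own logger and debug settings into downstream options via dict.setdefault, so an explicitly supplied value always wins. A minimal standalone sketch, not code from the package:

from sibi_dst.utils import Logger

class HelperSketch:
    def __init__(self):
        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
        self.debug = False

    def _load(self, **options):
        options.setdefault("debug", self.debug)      # only filled in when the caller omitted it
        options.setdefault("logger", self.logger)    # an explicit logger=... always wins
        return options

sketch = HelperSketch()
print(sketch._load())               # helper's own debug/logger injected
print(sketch._load(debug=True))     # caller's explicit debug=True preserved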

sibi_dst-0.3.12/sibi_dst/df_helper/core/__init__.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from ._params_config import ParamsConfig
+from ._query_config import QueryConfig
+from ._defaults import (
+    django_field_conversion_map_pandas,
+    django_field_conversion_map_dask,
+    sqlalchemy_field_conversion_map_dask,
+    normalize_sqlalchemy_type)
+
+__all__ = [
+    "ParamsConfig",
+    "QueryConfig",
+    "django_field_conversion_map_pandas",
+    "django_field_conversion_map_dask",
+    "sqlalchemy_field_conversion_map_dask",
+    "normalize_sqlalchemy_type"
+]

sibi_dst-0.3.12/sibi_dst/df_helper/core/_defaults.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
+#
+import json
+from typing import Dict
+
+import pandas as pd
+from sqlalchemy import String, Text, Integer, BigInteger, SmallInteger, Float, Boolean, DateTime, Date, Time, JSON, \
+    Numeric, UUID
+from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT
+
+# This is the defaults configuration file for the df_helper module.
+
+# conversion_map is a dictionary that maps the field types to their corresponding data type conversion functions.
+# Each entry in the dictionary is a pair of a field type (as a string) and a callable function that performs the
+# conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
+# the Django field type.
+
+django_field_conversion_map_pandas: Dict[str, callable] = {
+    "CharField": lambda x: x.astype(str),
+    "TextField": lambda x: x.astype(str),
+    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BooleanField": lambda x: x.astype(bool),
+    "NullBooleanField": lambda x: x.astype(bool),
+    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").dt.date,
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").dt.time,
+    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
+    # for JSONField, assuming JSON objects are represented as string in df
+    "JSONField": lambda x: x.apply(json.loads),
+    "ArrayField": lambda x: x.apply(eval),
+    "UUIDField": lambda x: x.astype(str),
+}
+
+django_field_conversion_map_dask: Dict[str, callable] = {
+    "CharField": lambda x: x.astype(str),
+    "TextField": lambda x: x.astype(str),
+    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BooleanField": lambda x: x.astype(bool),
+    "NullBooleanField": lambda x: x.astype(bool),
+    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
+    "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
+    "UUIDField": lambda x: x.astype(str),
+}
+
+sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+    String.__name__: lambda x: x.astype(str).fillna(""),
+    Text.__name__: lambda x: x.fillna('').astype(str),
+    Integer.__name__: lambda x: x.fillna(0).astype(int),
+    BigInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    SmallInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Float.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Boolean.__name__: lambda x: x.astype(bool),
+    DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    UUID.__name__: lambda x: x.astype(str),
+}
+
+# Conversion map with normalized SQLAlchemy field types
+# sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+# "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
+# "Text": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("text", "string")),
+# "Integer": lambda x: pd.to_numeric(x, errors="coerce"),
+# "SmallInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+# "BigInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+# "Float": lambda x: pd.to_numeric(x, errors="coerce"),
+# "Numeric": lambda x: pd.to_numeric(x, errors="coerce"),
+# "Boolean": lambda x: x.map_partitions(lambda s: s.fillna(False).astype(bool), meta=("boolean", "bool")),
+# "DateTime": lambda x: pd.to_datetime(x, errors="coerce"),
+# "Date": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.date, meta=("date", "object")),
+# "Time": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.time, meta=("time", "object")),
+# "JSON": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+# }
+
+
+def normalize_sqlalchemy_type(field_type):
+    """
+    Normalize SQLAlchemy field types to generic type names.
+    Handles dialect-specific types (e.g., MySQL).
+    """
+    # Map of generic SQLAlchemy types
+    type_mapping = {
+        String: "String",
+        Text: "Text",
+        Integer: "Integer",
+        SmallInteger: "SmallInteger",
+        BigInteger: "BigInteger",
+        Float: "Float",
+        Numeric: "Numeric",
+        Boolean: "Boolean",
+        DateTime: "DateTime",
+        Date: "Date",
+        Time: "Time",
+        JSON: "JSON",
+    }
+
+    # Dialect-specific types
+    dialect_mapping = {
+        TINYINT: "SmallInteger",
+        MEDIUMTEXT: "Text",
+    }
+
+    # Check if the field matches a generic or dialect-specific type
+    for sql_type, name in {**type_mapping, **dialect_mapping}.items():
+        if isinstance(field_type, sql_type):
+            return name
+
+    # Fallback to raw class name
+    return field_type.__class__.__name__
+
+
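
The new conversion map keys its dask converters by the generic SQLAlchemy type name, which is exactly what normalize_sqlalchemy_type() returns, so the two pieces compose directly. A minimal sketch of that composition; the Order model below is hypothetical and not part of the package:

from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.core import (
    normalize_sqlalchemy_type,
    sqlalchemy_field_conversion_map_dask,
)

Base = declarative_base()

class Order(Base):  # hypothetical model, for illustration only
    __tablename__ = "orders"
    id = Column(Integer, primary_key=True)
    reference = Column(String(32))
    created_at = Column(DateTime)

# Normalize each column's (possibly dialect-specific) type to a generic name,
# then look up the dask-compatible conversion callable keyed by that name.
for column in Order.__table__.columns:
    type_name = normalize_sqlalchemy_type(column.type)
    converter = sqlalchemy_field_conversion_map_dask.get(type_name)
    print(column.name, type_name, converter is not None)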

sibi_dst-0.3.12/sibi_dst/df_helper/plugins/django/_django_load_from_db.py
@@ -0,0 +1,121 @@
+import dask.dataframe as dd
+import pandas as pd
+from django.db.models import Q
+
+from sibi_dst.df_helper.plugins.django import ReadFrameDask
+from sibi_dst.utils import Logger
+from sibi_dst.df_helper.core import django_field_conversion_map_dask
+
+class DjangoLoadFromDb:
+    df: dd.DataFrame
+
+    def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
+        self.connection_config = db_connection
+        self.debug = kwargs.pop('debug', False)
+        self.verbose_debug = kwargs.pop('verbose_debug', False)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        if self.connection_config.model is None:
+            if self.debug:
+                self.logger.critical('Model must be specified')
+            if self.verbose_debug:
+                print('Model must be specified')
+            raise ValueError('Model must be specified')
+
+        self.query_config = db_query
+        self.params_config = db_params
+        self.params_config.parse_params(kwargs)
+
+    def build_and_load(self):
+        self.df = self._build_and_load()
+        #self.df = self._convert_columns(self.df)
+        return self.df
+
+
+    def _build_and_load(self) -> dd.DataFrame:
+        query = self.connection_config.model.objects.using(self.connection_config.connection_name)
+        if not self.params_config.filters:
+            # IMPORTANT: if no filters are provided show only the first n_records
+            # this is to prevent loading the entire table by mistake
+            n_records = self.query_config.n_records if self.query_config.n_records else 100
+            queryset=query.all()[:n_records]
+        else:
+            q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
+            queryset = query.filter(q_objects)
+        if queryset is not None:
+            try:
+                self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
+            except Exception as e:
+                self.logger.critical(f'Error loading query: {str(queryset.query)}, error message: {e}')
+                self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+        else:
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+        return self.df
+
+    @staticmethod
+    def __build_query_objects(filters: dict, use_exclude: bool):
+        q_objects = Q()
+        for key, value in filters.items():
+            if not use_exclude:
+                q_objects.add(Q(**{key: value}), Q.AND)
+            else:
+                q_objects.add(~Q(**{key: value}), Q.AND)
+        return q_objects
+
+    def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
+        """
+        Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+
+        def log_debug(message: str, is_verbose: bool = False):
+            """Helper to handle debug and verbose debug logging."""
+            if self.debug:
+                self.logger.debug(message)
+            if is_verbose and self.verbose_debug:
+                print(message)
+
+        if self.debug:
+            self.logger.info(f'Converting columns: {list(df.columns)}')
+
+        # Get field information from the Django model
+        model_fields = self.connection_config.model._meta.get_fields()
+        field_type_map = {field.name: type(field).__name__ for field in model_fields}
+        # Simplified loop to apply conversions partition-wise
+        for field_name, field_type in field_type_map.items():
+            if field_name not in df.columns:
+
+                log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                continue
+
+            conversion_func = django_field_conversion_map_dask.get(field_type)
+            if not conversion_func:
+                message=f"Field type '{field_type}' not found in conversion_map."
+                log_debug(message, is_verbose=True)
+                continue
+
+            def apply_conversion(partition):
+                """
+                Apply the conversion function to a single partition for the given column.
+                """
+                try:
+                    if field_name in partition.columns:
+                        partition[field_name] = conversion_func(partition[field_name])
+                except Exception as e:
+                    self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                return partition
+
+            try:
+                # Apply conversion lazily to each partition
+                df = df.map_partitions(
+                    apply_conversion,
+                    meta=df,
+                )
+                log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
+                          is_verbose=True)
+            except Exception as e:
+                log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+
+        return df
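
For reference, the filter semantics implemented by __build_query_objects above: every filter entry becomes a Q object and the entries are AND-ed together; with use_exclude=True each condition is negated before being AND-ed. A small standalone sketch of that behavior, with hypothetical field names:

from django.db.models import Q

filters = {"status": "closed", "created_at__gte": "2024-01-01"}  # hypothetical field names

q_include = Q()
q_exclude = Q()
for key, value in filters.items():
    q_include.add(Q(**{key: value}), Q.AND)    # use_exclude=False: AND of the conditions
    q_exclude.add(~Q(**{key: value}), Q.AND)   # use_exclude=True: AND of the negated conditions

# queryset = SomeModel.objects.using("default").filter(q_include)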

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/http/_http_config.py
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-        if not self.logger:
-            self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger
 
 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']
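
The handler change is purely about logging: it can now reuse a caller-supplied logger instead of each instance creating its own, which is how DfHelper._load_from_parquet now calls it. A hedged usage sketch; the DataFrame and filter spec are hypothetical and apply_filters_dask itself is unchanged by this release:

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils import Logger
from sibi_dst.df_helper.plugins.parquet._parquet_filter_handler import ParquetFilterHandler

logger = Logger.default_logger(logger_name="parquet-filters")
handler = ParquetFilterHandler(logger=logger)

ddf = dd.from_pandas(pd.DataFrame({"status": ["open", "closed"]}), npartitions=1)
filtered = handler.apply_filters_dask(ddf, {"status": "closed"})  # hypothetical filter spec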

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])
 
         # Validation for parquet path

sibi_dst-0.3.12/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
@@ -0,0 +1,133 @@
+import itertools
+
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy import create_engine, inspect, select
+from sqlalchemy.orm import sessionmaker
+
+from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from sibi_dst.utils import Logger
+
+
+class SQLAlchemyDask:
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
+        """
+        Initialize with an SQLAlchemy query and database engine URL.
+
+        :param model: SQLAlchemy ORM model.
+        :param filters: Filters to apply on the query.
+        :param engine_url: Database connection string for SQLAlchemy engine.
+        :param chunk_size: Number of records per chunk for Dask partitions.
+        :param logger: Logger instance for logging.
+        :param verbose: Whether to print detailed logs.
+        """
+        self.query = None
+        self.model = model
+        self.filters = filters
+        self.chunk_size = chunk_size
+        self.verbose = verbose
+        self.engine = create_engine(engine_url)
+        self.Session = sessionmaker(bind=self.engine)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
+    @staticmethod
+    def infer_dtypes_from_model(model):
+        """
+        Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
+        """
+        mapper = inspect(model)
+        sqlalchemy_to_dask_dtype = {
+            #'INTEGER': pd.to_numeric(x, errors="coerce"),
+            'INTEGER': 'Int64',
+            'SMALLINT': 'Int64',
+            'BIGINT': 'Int64',
+            'FLOAT': 'float64',
+            'NUMERIC': 'float64',
+            'BOOLEAN': 'bool',
+            'VARCHAR': 'object',
+            'TEXT': 'object',
+            'DATE': 'datetime64[ns]',
+            'DATETIME': 'datetime64[ns]',
+            'TIME': 'object',
+            'UUID': 'object',
+        }
+
+        dtypes = {}
+        for column in mapper.columns:
+            dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+            dtypes[column.name] = dtype
+
+        return dtypes
+
+    def read_frame(self, fillna_value=None):
+        """
+        Load data from an SQLAlchemy query into a Dask DataFrame.
+
+        :param fillna_value: Value to replace NaN or NULL values with, if any.
+        :return: Dask DataFrame.
+        """
+        with self.Session() as session:
+            try:
+                # Build query
+                self.query = select(self.model)
+                if self.filters:
+                    self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                else:
+                    n_records = 100
+                    self.query = self.query.limit(n_records)
+
+                # Infer dtypes
+                dtypes = self.infer_dtypes_from_model(self.model)
+                # Get the column order from the SQLAlchemy model
+                ordered_columns = [column.name for column in self.model.__table__.columns]
+
+                # Execute query and fetch results in chunks
+                result_proxy = session.execute(self.query)
+                results = result_proxy.scalars().all()  # Fetch all rows
+                iterator = iter(results)
+
+                partitions = []
+
+                while True:
+                    chunk = list(itertools.islice(iterator, self.chunk_size))
+                    if not chunk:
+                        break
+
+                    # Convert chunk to Pandas DataFrame
+                    df = pd.DataFrame.from_records(
+                        [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    )
+                    # Drop internal SQLAlchemy state if it exists
+                    df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+
+                    # Reorder columns to match the model's order
+                    df = df[ordered_columns]
+
+                    # Fill NaN values
+                    if fillna_value is not None:
+                        df = df.fillna(fillna_value)
+
+                    # Convert timezone-aware columns to naive
+                    for col in df.columns:
+                        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                            df[col] = df[col].dt.tz_localize(None)
+
+                    # Apply inferred dtypes
+                    df = df.astype(dtypes)
+                    # Create a Dask partition
+                    partitions.append(dd.from_pandas(df, npartitions=1))
+
+                # Concatenate all partitions
+                # print(partitions)
+                if partitions:
+                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                else:
+                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                if self.verbose:
+                    self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+                return dask_df
+
+            except Exception as e:
+                self.logger.error(f"Error executing query: {str(e)}")
+                return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
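
A hedged usage sketch for the new reader: it builds a SELECT over the model, applies the filters through SqlAlchemyFilterHandler (or limits to 100 rows when no filters are given), fetches rows in chunk_size batches and concatenates them into a Dask DataFrame. The model, connection string and filter key below are hypothetical:

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.plugins.sql_alchemy._io_sqlalchemy_dask import SQLAlchemyDask
from sibi_dst.utils import Logger

Base = declarative_base()

class Order(Base):  # hypothetical model, for illustration only
    __tablename__ = "orders"
    id = Column(Integer, primary_key=True)
    status = Column(String(16))

reader = SQLAlchemyDask(
    model=Order,
    filters={"status": "closed"},           # handed to SqlAlchemyFilterHandler; key is hypothetical
    engine_url="sqlite:///example.db",      # any SQLAlchemy connection string
    chunk_size=1000,
    logger=Logger.default_logger(logger_name="sqlalchemy-dask"),
    verbose=True,
)
ddf = reader.read_frame(fillna_value="")    # Dask DataFrame; an empty frame is returned on error
print(ddf.head())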

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
@@ -50,6 +50,7 @@ class SqlAlchemyFilterHandler:
                 return [datetime.date.fromisoformat(v) for v in value]
             return value
 
+
         def handle_date_operator(column, date_op):
             """
             Handle filtering on specific datetime parts (e.g., year, month).
@@ -90,6 +91,7 @@ class SqlAlchemyFilterHandler:
 
             # Get the column from the model
             column = getattr(model, field_name, None)
+            #column = model.__table__.columns.get(field_name)
             if not column:
                 raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
 

sibi_dst-0.3.12/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -0,0 +1,59 @@
+from typing import Dict
+
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy.inspection import inspect
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import select
+#from sqlmodel import Session, select
+
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
+    normalize_sqlalchemy_type
+from sibi_dst.utils import Logger
+from ._io_sqlalchemy_dask import SQLAlchemyDask
+from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
+
+
+class SqlAlchemyLoadFromDb:
+    df: dd.DataFrame
+
+    def __init__(
+        self,
+        plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+        plugin_query: QueryConfig = None,
+        plugin_params: ParamsConfig = None,
+        logger: Logger = None,
+        **kwargs,
+    ):
+        """
+        Initialize the loader with database connection, query, and parameters.
+        """
+        self.db_connection = plugin_sqlalchemy
+        self.table_name = self.db_connection.table
+        self.model = self.db_connection.model
+        self.engine = self.db_connection.engine
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.query_config = plugin_query
+        self.params_config = plugin_params
+        self.debug = kwargs.pop("debug", False)
+        self.verbose_debug = kwargs.pop("verbose_debug", False)
+
+    def build_and_load(self) -> dd.DataFrame:
+        """
+        Load data into a Dask DataFrame based on the query and parameters.
+        """
+        self.df = self._build_and_load()
+        return self.df
+
+    def _build_and_load(self) -> dd.DataFrame:
+        try:
+            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
+            df = reader.read_frame()
+            if df is None or len(df.index) == 0:
+                self.logger.warning("Query returned no results.")
+                return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return df
+        except Exception as e:
+            self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)

{sibi_dst-0.3.10 → sibi_dst-0.3.12}/sibi_dst/utils/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from ._credentials import ConfigManager, ConfigLoader
 from ._log_utils import Logger
-from ._date_utils import DateUtils
+from ._date_utils import DateUtils, BusinessDays
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
     "ConfigLoader",
     "Logger",
     "DateUtils",
+    "BusinessDays",
     "FileUtils",
     "DataWrapper",
     "DataUtils",