sibi-dst 0.3.20__py3-none-any.whl → 0.3.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. sibi_dst/__init__.py +1 -1
  2. sibi_dst/df_helper/__init__.py +2 -2
  3. sibi_dst/df_helper/_df_helper.py +34 -33
  4. sibi_dst/df_helper/_parquet_artifact.py +4 -1
  5. sibi_dst/df_helper/_parquet_reader.py +2 -1
  6. sibi_dst/df_helper/backends/django/__init__.py +1 -2
  7. sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
  8. sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
  9. sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
  10. sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
  11. sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
  12. sibi_dst/df_helper/backends/http/__init__.py +2 -2
  13. sibi_dst/df_helper/backends/http/_http_config.py +6 -3
  14. sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
  15. sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
  16. sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
  17. sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
  18. sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
  19. sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
  20. sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
  21. sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
  22. sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
  23. sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
  24. sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
  25. sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
  26. sibi_dst/df_helper/core/_defaults.py +9 -6
  27. sibi_dst/df_helper/core/_filter_handler.py +7 -4
  28. sibi_dst/df_helper/core/_params_config.py +3 -2
  29. sibi_dst/df_helper/core/_query_config.py +0 -2
  30. sibi_dst/utils/__init__.py +6 -5
  31. sibi_dst/utils/_airflow_manager.py +4 -3
  32. sibi_dst/utils/_clickhouse_writer.py +16 -13
  33. sibi_dst/utils/_credentials.py +1 -1
  34. sibi_dst/utils/_data_wrapper.py +82 -16
  35. sibi_dst/utils/_date_utils.py +11 -5
  36. sibi_dst/utils/_df_utils.py +9 -5
  37. sibi_dst/utils/_file_utils.py +3 -1
  38. sibi_dst/utils/_filepath_generator.py +4 -2
  39. sibi_dst/utils/_log_utils.py +1 -1
  40. sibi_dst/utils/_parquet_saver.py +0 -2
  41. sibi_dst/utils/_storage_manager.py +1 -1
  42. {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/METADATA +1 -1
  43. sibi_dst-0.3.22.dist-info/RECORD +47 -0
  44. sibi_dst-0.3.20.dist-info/RECORD +0 -47
  45. {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/WHEEL +0 -0
sibi_dst/__init__.py CHANGED
@@ -6,4 +6,4 @@ except ImportError:
 try:
     __version__ = version_reader.version("sibi-dst")
 except version_reader.PackageNotFoundError:
-    __version__ = "unknown"
+    __version__ = "unknown"
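The hunk above only normalizes the file's trailing newline, but the pattern it touches is the standard importlib.metadata version lookup. A minimal standalone sketch of that idiom (only the package name is taken from the source; nothing else is project-specific):

# Sketch of the version-detection idiom used in sibi_dst/__init__.py.
from importlib import metadata as version_reader

try:
    __version__ = version_reader.version("sibi-dst")  # version of the installed distribution
except version_reader.PackageNotFoundError:
    # Running from an uninstalled source checkout.
    __version__ = "unknown"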
sibi_dst/df_helper/__init__.py CHANGED
@@ -4,8 +4,8 @@ from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
 
-__all__=[
+__all__ = [
     'DfHelper',
     'ParquetArtifact',
     'ParquetReader',
-]
+]
sibi_dst/df_helper/_df_helper.py CHANGED
@@ -27,11 +27,12 @@ warnings.filterwarnings(
     category=UserWarning,
 )
 
+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     backend_django: Optional[DjangoConnectionConfig] = None
-    backend_query: Optional[QueryConfig] = None
-    backend_params: Optional[ParamsConfig] = None
+    _backend_query: Optional[QueryConfig] = None
+    _backend_params: Optional[ParamsConfig] = None
     backend_parquet: Optional[ParquetConfig] = None
     backend_http: Optional[HttpConfig] = None
    backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
@@ -45,7 +46,7 @@ class DfHelper:
         kwargs = {**self.default_config.copy(), **kwargs}
         self.backend = backend
         self.debug = kwargs.setdefault("debug", False)
-        self.logger = kwargs.get("logger",Logger.default_logger(logger_name=self.__class__.__name__))
+        self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
         # Configure logger level
         self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
         self.logger.debug("Logger initialized in DEBUG mode.")
@@ -54,15 +55,15 @@ class DfHelper:
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
-        self.post_init(**kwargs)
+        self.__post_init(**kwargs)
 
     def __str__(self):
         return self.__class__.__name__
 
-    def post_init(self, **kwargs):
+    def __post_init(self, **kwargs):
         self.logger.debug(f"backend used: {self.backend}")
-        self.backend_query = self.__get_config(QueryConfig, kwargs)
-        self.backend_params = self.__get_config(ParamsConfig, kwargs)
+        self._backend_query = self.__get_config(QueryConfig, kwargs)
+        self._backend_params = self.__get_config(ParamsConfig, kwargs)
         if self.backend == 'django_db':
             self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
         elif self.backend == 'parquet':
@@ -89,42 +90,42 @@ class DfHelper:
 
     def load(self, **options):
         # this will be the universal method to load data from a df irrespective of the backend
-        df = self._load(**options)
+        df = self.__load(**options)
         if self.as_pandas:
             return df.compute()
         return df
 
-    def _load(self, **options):
+    def __load(self, **options):
 
         if self.backend == 'django_db':
-            self.backend_params.parse_params(options)
-            return self._load_from_db(**options)
+            self._backend_params.parse_params(options)
+            return self.__load_from_db(**options)
         elif self.backend == 'sqlalchemy':
-            self.backend_params.parse_params(options)
-            return self._load_from_sqlalchemy(**options)
+            self._backend_params.parse_params(options)
+            return self.__load_from_sqlalchemy(**options)
         elif self.backend == 'parquet':
-            return self._load_from_parquet(**options)
+            return self.__load_from_parquet(**options)
         elif self.backend == 'http':
             if asyncio.get_event_loop().is_running():
                 self.logger.debug("Running as a task from an event loop")
-                return asyncio.create_task(self._load_from_http(**options))
+                return asyncio.create_task(self.__load_from_http(**options))
             else:
                 self.logger.debug("Regular asyncio run...")
-                return asyncio.run(self._load_from_http(**options))
+                return asyncio.run(self.__load_from_http(**options))
 
-    def _load_from_sqlalchemy(self, **options):
+    def __load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.backend_sqlalchemy,
-                self.backend_query,
-                self.backend_params,
+                self._backend_query,
+                self._backend_params,
                 self.logger,
                 **options
             )
             self.df = db_loader.build_and_load()
-            self._process_loaded_data()
-            self._post_process_df()
+            self.__process_loaded_data()
+            self.__post_process_df()
             self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
@@ -132,19 +133,19 @@ class DfHelper:
 
         return self.df
 
-    def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
                 self.backend_django,
-                self.backend_query,
-                self.backend_params,
+                self._backend_query,
+                self._backend_params,
                 self.logger,
                 **options
             )
             self.df = db_loader.build_and_load()
-            self._process_loaded_data()
-            self._post_process_df()
+            self.__process_loaded_data()
+            self.__post_process_df()
             self.logger.debug("Data successfully loaded from django database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from django database: {e}")
@@ -152,7 +153,7 @@ class DfHelper:
 
         return self.df
 
-    async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
         if not self.backend_http:
             self.logger.debug("HTTP plugin not configured properly.")
@@ -164,12 +165,12 @@ class DfHelper:
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
-    def _post_process_df(self):
+    def __post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
         Optimized for large datasets with Dask compatibility.
         """
-        df_params = self.backend_params.df_params
+        df_params = self._backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
         index_col = df_params.get("index_col", None)
         datetime_index = df_params.get("datetime_index", False)
@@ -203,10 +204,10 @@ class DfHelper:
 
         self.logger.debug("Post-processing of DataFrame completed.")
 
-    def _process_loaded_data(self):
+    def __process_loaded_data(self):
         self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
-            field_map = self.backend_params.field_map or {}
+            field_map = self._backend_params.field_map or {}
             if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
                 missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
@@ -237,7 +238,7 @@ class DfHelper:
         cs.save_to_clickhouse(self.df)
         self.logger.debug("Save to ClickHouse completed.")
 
-    def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.backend_parquet.load_files()
         if options:
             """
@@ -274,7 +275,7 @@ class DfHelper:
             raise ValueError("The 'start' date cannot be later than the 'end' date.")
 
         # Reverse map to original field name
-        field_map = getattr(self.backend_params, 'field_map', {}) or {}
+        field_map = getattr(self._backend_params, 'field_map', {}) or {}
         reverse_map = {v: k for k, v in field_map.items()}
         mapped_field = reverse_map.get(dt_field, dt_field)
 
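Most of the churn in _df_helper.py is the rename of single-underscore helpers (_load, _load_from_db, _post_process_df, ...) to double-underscore names. The practical effect is Python's class-private name mangling: subclasses can no longer shadow or call these helpers by accident. A short, purely illustrative sketch of that behavior (class and method names here are invented, not taken from sibi-dst):

# Illustrative only: Python name mangling for double-underscore methods.
class Base:
    def load(self):
        return self.__load()          # compiled as self._Base__load()

    def __load(self):                 # mangled to _Base__load
        return "base loader"


class Child(Base):
    def __load(self):                 # mangled to _Child__load, does NOT override Base's helper
        return "child loader"


print(Child().load())                 # -> "base loader"
print(Child()._Base__load())          # the mangled name is still reachable explicitly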
sibi_dst/df_helper/_parquet_artifact.py CHANGED
@@ -1,6 +1,8 @@
 from typing import Optional, Any, Dict
+
 import dask.dataframe as dd
 import fsspec
+
 from sibi_dst.df_helper import DfHelper
 from sibi_dst.utils import DataWrapper
 from sibi_dst.utils import DateUtils
@@ -106,9 +108,10 @@ class ParquetArtifact(DfHelper):
             'parquet_start_date': start_date.strftime('%Y-%m-%d'),
             'parquet_end_date': end_date.strftime('%Y-%m-%d'),
         }
+
     def ensure_directory_exists(self, path: str) -> None:
         """Ensure the directory exists in the specified filesystem."""
         try:
             self.fs.makedirs(path, exist_ok=True)
         except Exception as e:
-            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+            raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
sibi_dst/df_helper/_parquet_reader.py CHANGED
@@ -5,6 +5,7 @@ import fsspec
 
 from sibi_dst.df_helper import DfHelper
 
+
 class ParquetReader(DfHelper):
     DEFAULT_CONFIG = {
         'backend': 'parquet'
@@ -46,4 +47,4 @@ class ParquetReader(DfHelper):
             info = self.fs.info(self.parquet_storage_path)
             return info['type'] == 'directory'
         except FileNotFoundError:
-            return False
+            return False
sibi_dst/df_helper/backends/django/__init__.py CHANGED
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from ._django_db_connection import DjangoConnectionConfig
 from ._io_dask import ReadFrameDask
-#from ._io_dask_alt import ReadFrameDask
+from ._django_db_connection import DjangoConnectionConfig
 from ._django_load_from_db import DjangoLoadFromDb
 
 __all__ = [
sibi_dst/df_helper/backends/django/_django_db_connection.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Union
+from typing import Any
 
 from pydantic import BaseModel, model_validator
 
sibi_dst/df_helper/backends/django/_django_load_from_db.py CHANGED
@@ -2,12 +2,12 @@ import warnings
 
 import dask.dataframe as dd
 import pandas as pd
-from IPython.core.hooks import deprecated
 from django.db.models import Q
 
 from sibi_dst.df_helper.backends.django import ReadFrameDask
-from sibi_dst.utils import Logger
 from sibi_dst.df_helper.core import django_field_conversion_map_dask
+from sibi_dst.utils import Logger
+
 
 class DjangoLoadFromDb:
     df: dd.DataFrame
@@ -28,17 +28,16 @@ class DjangoLoadFromDb:
 
     def build_and_load(self):
         self.df = self._build_and_load()
-        #self.df = self._convert_columns(self.df)
+        # self.df = self._convert_columns(self.df)
         return self.df
 
-
     def _build_and_load(self) -> dd.DataFrame:
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
             # IMPORTANT: if no filters are provided show only the first n_records
             # this is to prevent loading the entire table by mistake
             n_records = self.query_config.n_records if self.query_config.n_records else 100
-            queryset=query.all()[:n_records]
+            queryset = query.all()[:n_records]
         else:
             q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
             queryset = query.filter(q_objects)
@@ -99,13 +98,12 @@ class DjangoLoadFromDb:
         # Simplified loop to apply conversions partition-wise
         for field_name, field_type in field_type_map.items():
             if field_name not in df.columns:
-
                 self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                 continue
 
             conversion_func = django_field_conversion_map_dask.get(field_type)
             if not conversion_func:
-                message=f"Field type '{field_type}' not found in conversion_map."
+                message = f"Field type '{field_type}' not found in conversion_map."
                 self.logger.debug(message)
                 continue
 
@@ -130,4 +128,4 @@ class DjangoLoadFromDb:
         except Exception as e:
             self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")
 
-        return df
+        return df
sibi_dst/df_helper/backends/django/_django_sql_model_builder.py CHANGED
@@ -219,9 +219,9 @@ class DjangoSqlModelBuilder:
             if field_type == "AutoField(":
                 continue
             elif (
-                field_type
-                == connection.features.introspected_field_types["AutoField"]
-                + "("
+                    field_type
+                    == connection.features.introspected_field_types["AutoField"]
+                    + "("
             ):
                 comment_notes.append("AutoField?")
 
@@ -240,8 +240,8 @@ class DjangoSqlModelBuilder:
 
             # Add comment.
             if (
-                hasattr(connection.features, "supports_comments")
-                and row.comment
+                    hasattr(connection.features, "supports_comments")
+                    and row.comment
             ):
                 extra_params["db_comment"] = row.comment
             # if connection.features.supports_comments and row.comment:
sibi_dst/df_helper/backends/django/_io_dask.py CHANGED
@@ -1,7 +1,6 @@
 import itertools
 
 import dask.dataframe as dd
-import dask_expr
 import django
 import pandas as pd
 from django.core.cache import cache
sibi_dst/df_helper/backends/django/_io_dask_alt.py CHANGED
@@ -1,13 +1,14 @@
 import itertools
+
 import dask.dataframe as dd
+import django
 import pandas as pd
-
 from django.core.cache import cache
+from django.core.exceptions import FieldDoesNotExist
 from django.db import models
 from django.db.models import Field
 from django.utils.encoding import force_str as force_text
-import django
-from django.core.exceptions import FieldDoesNotExist
+
 
 class ReadFrameDask:
     FieldDoesNotExist = (
@@ -185,4 +186,4 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, qs.model._meta.fields)
 
-        return dask_df
+        return dask_df
sibi_dst/df_helper/backends/http/__init__.py CHANGED
@@ -2,6 +2,6 @@ from __future__ import annotations
 
 from ._http_config import HttpConfig
 
-__all__=[
+__all__ = [
     'HttpConfig'
-]
+]
sibi_dst/df_helper/backends/http/_http_config.py CHANGED
@@ -1,10 +1,13 @@
-from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
 from typing import Dict, Optional, Any
-import httpx
+
 import dask.dataframe as dd
+import httpx
 import pandas as pd
+from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
+
 from sibi_dst.utils import Logger
 
+
 class HttpConfig(BaseModel):
     base_url: HttpUrl
     params: Optional[Dict[str, Any]] = Field(default_factory=dict)
@@ -44,4 +47,4 @@ class HttpConfig(BaseModel):
             raise
         except ValueError as e:
             self.logger.debug(f"Error parsing JSON data: {e}")
-            raise
+            raise
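The import reordering above touches the HttpConfig plugin that DfHelper awaits from __load_from_http. A rough, hedged sketch of the underlying pattern, fetching JSON with httpx and wrapping it in a single-partition Dask DataFrame; the URL, parameters, and function name are placeholders, not the plugin's actual API:

# Hedged sketch: async JSON fetch -> Dask DataFrame, loosely mirroring the HTTP backend's job.
import asyncio
from typing import Optional

import dask.dataframe as dd
import httpx
import pandas as pd


async def fetch_frame(base_url: str, params: Optional[dict] = None) -> dd.DataFrame:
    async with httpx.AsyncClient() as client:
        response = await client.get(base_url, params=params or {})
        response.raise_for_status()        # surface HTTP errors instead of parsing a bad payload
        records = response.json()          # assumes the endpoint returns a JSON array of records
    return dd.from_pandas(pd.DataFrame(records), npartitions=1)


# df = asyncio.run(fetch_frame("https://example.com/api/items", {"limit": 10}))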
sibi_dst/df_helper/backends/parquet/__init__.py CHANGED
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-from ._parquet_options import *
 from ._parquet_filter_handler import ParquetFilterHandler
+from ._parquet_options import *
 
-__all__=[
+__all__ = [
     "ParquetConfig",
     "ParquetFilterHandler",
-]
+]
sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py CHANGED
@@ -1,7 +1,9 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class ParquetFilterHandler(object):
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -92,4 +94,4 @@ class ParquetFilterHandler(object):
         else:
             raise ValueError(f"Unsupported operation: {operation}")
 
-        return df
+        return df
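The handler diffed above dispatches Django-style filter operations onto Dask columns before raising on anything unsupported. A minimal sketch of that dispatch style; the operation names and sample data are illustrative, not necessarily the handler's exact lookup keys:

# Hedged sketch of operation-keyed filtering in the spirit of ParquetFilterHandler.
import dask.dataframe as dd
import pandas as pd

OPS = {
    "exact": lambda col, v: col == v,
    "gte": lambda col, v: col >= v,
    "lte": lambda col, v: col <= v,
    "in": lambda col, v: col.isin(v),
}

df = dd.from_pandas(pd.DataFrame({"qty": [1, 5, 10], "sku": ["a", "b", "c"]}), npartitions=1)
mask = OPS["gte"](df["qty"], 5)       # build a lazy boolean mask
print(df[mask].compute())             # rows with qty >= 5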
sibi_dst/df_helper/backends/parquet/_parquet_options.py CHANGED
@@ -1,12 +1,15 @@
+import datetime
 from pathlib import Path
 from typing import Optional, List
+
 import dask.dataframe as dd
-from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
 import fsspec
-import datetime
+from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
+
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger
 
+
 class ParquetConfig(BaseModel):
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
@@ -27,7 +30,8 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])
+        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+            str(self.parquet_storage_path).split("://")[0])
 
         # Validation for parquet path
         if self.parquet_storage_path is None:
@@ -37,7 +41,8 @@ class ParquetConfig(BaseModel):
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(
-                filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]), extension='parquet'
+                filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
+                extension='parquet'
             )
             self.parquet_is_recent = self.is_file_recent()
             self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)
@@ -52,10 +57,11 @@ class ParquetConfig(BaseModel):
                 raise ValueError('Parquet end date must be greater than start date')
 
             # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), logger=self.logger).generate_file_paths(start_date, end_date)
+            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+                                                         logger=self.logger).generate_file_paths(start_date, end_date)
             self.parquet_size_bytes = self.get_parquet_size_bytes()
             self.load_parquet = True
-            #self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
+            # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
         elif self.parquet_end_date is not None:
             raise ValueError('Parquet start date must be specified if end date is provided')
 
@@ -88,4 +94,3 @@ class ParquetConfig(BaseModel):
     def ensure_file_extension(filepath: str, extension: str) -> str:
         path = Path(filepath)
         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
-
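The rewrapped line in the first hunk is purely a line-length reformat, but the expression it touches — choosing an fsspec filesystem from the storage path's protocol prefix — is worth seeing on its own. A small sketch of that inference; the paths are examples, not project defaults:

# Sketch of the protocol-based filesystem selection used by ParquetConfig.
import fsspec


def filesystem_for(path: str):
    # "s3://bucket/data" -> "s3", "file:///tmp/x" -> "file", bare local paths fall back to "file"
    protocol = path.split("://")[0] if "://" in path else "file"
    return fsspec.filesystem(protocol)


fs = filesystem_for("/tmp/warehouse/parquet")      # local filesystem
# fs = filesystem_for("s3://my-bucket/warehouse")  # would additionally require s3fs to be installed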
sibi_dst/df_helper/backends/sql_alchemy/__init__.py CHANGED
@@ -1,6 +1,6 @@
+from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
-from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder
 from ._sqlalchemy_load_from_db import SqlAlchemyLoadFromDb
-from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder
 
 __all__ = ['SqlAlchemyConnectionConfig', 'SqlAlchemyModelBuilder', 'SqlAlchemyLoadFromDb', 'SqlAlchemyFilterHandler']
sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py CHANGED
@@ -74,7 +74,9 @@ class SQLAlchemyDask:
             deprecated specific filter handling to a generic one
             #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
             """
-            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
+            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query,
+                                                                                               model=self.model,
+                                                                                               filters=self.filters)
         else:
             n_records = 100
             self.query = self.query.limit(n_records)
sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py CHANGED
@@ -52,7 +52,6 @@ class SqlAlchemyFilterHandler:
                 return [datetime.date.fromisoformat(v) for v in value]
             return value
 
-
         def handle_date_operator(column, date_op):
             """
             Handle filtering on specific datetime parts (e.g., year, month).
@@ -93,7 +92,7 @@ class SqlAlchemyFilterHandler:
 
             # Get the column from the model
             column = getattr(model, field_name, None)
-            #column = model.__table__.columns.get(field_name)
+            # column = model.__table__.columns.get(field_name)
             if not column:
                 raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
 
@@ -117,4 +116,4 @@ class SqlAlchemyFilterHandler:
             else:
                 raise ValueError(f"Unsupported operation: {operation}")
 
-        return query
+        return query
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py CHANGED
@@ -1,18 +1,19 @@
 from typing import Any, Optional
 
 from pydantic import BaseModel, model_validator
+from sqlalchemy import create_engine
 from sqlalchemy.exc import OperationalError
 from sqlalchemy.sql import text
-from sqlalchemy import create_engine
+
 from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder
 
+
 class SqlAlchemyConnectionConfig(BaseModel):
     connection_url: str
     table: Optional[str] = None
     model: Any = None
     engine: Optional[Any] = None  # Save engine to reuse it
 
-
     @model_validator(mode="after")
     def validate_and_initialize(self):
         """
@@ -45,4 +46,3 @@ class SqlAlchemyConnectionConfig(BaseModel):
                 connection.execute(text("SELECT 1"))
         except OperationalError as e:
             raise ValueError(f"Failed to connect to the database: {e}")
-
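The reordered imports above belong to the validator that builds one reusable engine and probes the database with SELECT 1. A hedged, standalone version of that probe; the connection URL is a placeholder:

# Sketch of the connectivity check performed by SqlAlchemyConnectionConfig.
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError


def validate_connection(connection_url: str) -> None:
    engine = create_engine(connection_url)
    try:
        with engine.connect() as connection:
            connection.execute(text("SELECT 1"))   # cheap round-trip to prove the DSN works
    except OperationalError as exc:
        raise ValueError(f"Failed to connect to the database: {exc}") from exc


# validate_connection("sqlite:///:memory:")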
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py CHANGED
@@ -1,5 +1,4 @@
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -7,6 +6,7 @@ from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
 
+
 class SqlAlchemyLoadFromDb:
     df: dd.DataFrame = None
 
@@ -52,7 +52,7 @@ class SqlAlchemyLoadFromDb:
 
         if self.df is None or len(self.df.head().index) == 0:
             self.logger.debug("Query returned no results.")
-            dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
             return dask_df
         return self.df
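The whitespace fix above sits in the fallback branch that hands back an empty frame when the query yields nothing. The pattern itself is plain Dask; a minimal reproduction:

# Sketch of the empty-result fallback used by SqlAlchemyLoadFromDb.
import dask.dataframe as dd
import pandas as pd

empty = dd.from_pandas(pd.DataFrame(), npartitions=1)   # zero-row, zero-column Dask DataFrame
print(len(empty.head().index))                           # 0 -> treated as "no results"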
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py CHANGED
@@ -1,15 +1,17 @@
 import re
+
 from sqlalchemy import MetaData, Table
 from sqlalchemy.orm import declarative_base, relationship
 
 # Base class for dynamically created models
 Base = declarative_base()
 
-
 apps_label = "datacubes"
 
+
 class SqlAlchemyModelBuilder:
     _model_cache = {}  # Local cache for model classes
+
     def __init__(self, engine, table_name):
         """
         Initialize the model builder with a database engine and specific table.
@@ -58,7 +60,7 @@ class SqlAlchemyModelBuilder:
 
         # Add columns and relationships to the model
         attrs.update(columns)
-        #self.add_relationships(attrs, self.table)
+        # self.add_relationships(attrs, self.table)
         model = Base.registry._class_registry.get(self.class_name)
         if not model:
             model = type(self.class_name, (Base,), attrs)
@@ -126,4 +128,4 @@ class SqlAlchemyModelBuilder:
         column_name = re.sub(r"\W|^(?=\d)", "_", column_name)
         if column_name in {"class", "def", "return", "yield", "global"}:
             column_name += "_field"
-        return column_name
+        return column_name
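The builder diffed above caches dynamically generated ORM classes; the core move is reflecting a table and minting a declarative class with type(). A rough, self-contained sketch of that technique under assumptions of my own (the table, columns, and class name below are invented for illustration):

# Hedged sketch of reflection-based model building, loosely following SqlAlchemyModelBuilder.
from sqlalchemy import MetaData, Table, create_engine
from sqlalchemy.orm import declarative_base

Base = declarative_base()
engine = create_engine("sqlite:///:memory:")

# Create a throwaway table so there is something to reflect.
with engine.begin() as conn:
    conn.exec_driver_sql("CREATE TABLE customers (id INTEGER PRIMARY KEY, name TEXT)")

metadata = MetaData()
table = Table("customers", metadata, autoload_with=engine)   # reflect columns from the database

# Mint a mapped class at runtime; "Customers" mirrors the PascalCase naming convention.
Customers = type("Customers", (Base,), {"__table__": table})

print([c.name for c in Customers.__table__.columns])          # ['id', 'name']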
sibi_dst/df_helper/backends/sql_model/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from ._sqlmodel_load_from_db import SQLModelLoadFromDb
 from ._sqlmodel_db_connection import SQLModelConnectionConfig
+from ._sqlmodel_load_from_db import SQLModelLoadFromDb
 
 __all__ = [
     "SQLModelLoadFromDb",
sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py CHANGED
@@ -1,9 +1,10 @@
+import datetime
 from typing import Any, Optional, Dict, Type
+
 from pydantic import BaseModel, model_validator
-from sqlmodel import SQLModel, Field, create_engine
 from sqlalchemy import inspect
-from sqlalchemy.sql import text
 from sqlalchemy.exc import OperationalError
+from sqlalchemy.sql import text
 from sqlalchemy.sql.sqltypes import (
     Integer,
     String,
@@ -14,7 +15,7 @@ from sqlalchemy.sql.sqltypes import (
     Time,
     Numeric,
 )
-import datetime
+from sqlmodel import SQLModel, Field, create_engine
 
 
 class SQLModelConnectionConfig(BaseModel):
@@ -130,4 +131,4 @@ class SQLModelConnectionConfig(BaseModel):
     @staticmethod
     def _table2model(table_name: str) -> str:
         """Convert table name to PascalCase model name."""
-        return "".join(word.capitalize() for word in table_name.split("_"))
+        return "".join(word.capitalize() for word in table_name.split("_"))
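For reference, the _table2model helper shown in the last hunk is a plain snake_case-to-PascalCase conversion; a one-line demonstration with a sample table name of my choosing:

# Behavior of the _table2model naming convention (same expression, illustrative input only).
def table2model(table_name: str) -> str:
    return "".join(word.capitalize() for word in table_name.split("_"))

print(table2model("customer_order_items"))   # -> "CustomerOrderItems"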