sibi-dst 0.3.19__tar.gz → 0.3.21__tar.gz

This diff compares two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (47)
  1. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/pyproject.toml +1 -1
  3. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/__init__.py +1 -1
  4. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/__init__.py +2 -2
  5. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_df_helper.py +34 -33
  6. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_parquet_artifact.py +4 -1
  7. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_parquet_reader.py +2 -1
  8. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/__init__.py +1 -2
  9. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
  10. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
  11. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
  12. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
  13. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
  14. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/http/__init__.py +2 -2
  15. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/http/_http_config.py +6 -3
  16. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
  17. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
  18. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
  19. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
  20. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
  21. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
  22. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
  23. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
  24. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
  25. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_defaults.py +9 -6
  26. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_filter_handler.py +7 -4
  27. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_params_config.py +3 -2
  28. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_query_config.py +0 -2
  29. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/__init__.py +10 -9
  30. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_airflow_manager.py +4 -3
  31. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_clickhouse_writer.py +16 -13
  32. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_data_wrapper.py +7 -4
  33. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_date_utils.py +11 -5
  34. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_df_utils.py +9 -5
  35. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_file_utils.py +3 -1
  36. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_filepath_generator.py +4 -2
  37. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_log_utils.py +1 -1
  38. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_parquet_saver.py +0 -2
  39. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/README.md +0 -0
  40. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/__init__.py +0 -0
  41. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
  42. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
  43. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
  44. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/__init__.py +0 -0
  45. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_credentials.py +1 -1
  46. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_data_utils.py +0 -0
  47. {sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_storage_manager.py +1 -1
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.19
+ Version: 0.3.21
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sibi-dst"
- version = "0.3.19"
+ version = "0.3.21"
  description = "Data Science Toolkit"
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
  readme = "README.md"
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/__init__.py
@@ -6,4 +6,4 @@ except ImportError:
  try:
  __version__ = version_reader.version("sibi-dst")
  except version_reader.PackageNotFoundError:
- __version__ = "unknown"
+ __version__ = "unknown"
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/__init__.py
@@ -4,8 +4,8 @@ from ._df_helper import DfHelper
  from ._parquet_artifact import ParquetArtifact
  from ._parquet_reader import ParquetReader

- __all__=[
+ __all__ = [
  'DfHelper',
  'ParquetArtifact',
  'ParquetReader',
- ]
+ ]
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_df_helper.py
@@ -27,11 +27,12 @@ warnings.filterwarnings(
  category=UserWarning,
  )

+
  class DfHelper:
  df: Union[dd.DataFrame, pd.DataFrame] = None
  backend_django: Optional[DjangoConnectionConfig] = None
- backend_query: Optional[QueryConfig] = None
- backend_params: Optional[ParamsConfig] = None
+ _backend_query: Optional[QueryConfig] = None
+ _backend_params: Optional[ParamsConfig] = None
  backend_parquet: Optional[ParquetConfig] = None
  backend_http: Optional[HttpConfig] = None
  backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
@@ -45,7 +46,7 @@ class DfHelper:
  kwargs = {**self.default_config.copy(), **kwargs}
  self.backend = backend
  self.debug = kwargs.setdefault("debug", False)
- self.logger = kwargs.get("logger",Logger.default_logger(logger_name=self.__class__.__name__))
+ self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
  # Configure logger level
  self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
  self.logger.debug("Logger initialized in DEBUG mode.")
@@ -54,15 +55,15 @@ class DfHelper:
  self.as_pandas = kwargs.setdefault("as_pandas", False)
  kwargs.setdefault("live", True)
  kwargs.setdefault("logger", self.logger)
- self.post_init(**kwargs)
+ self.__post_init(**kwargs)

  def __str__(self):
  return self.__class__.__name__

- def post_init(self, **kwargs):
+ def __post_init(self, **kwargs):
  self.logger.debug(f"backend used: {self.backend}")
- self.backend_query = self.__get_config(QueryConfig, kwargs)
- self.backend_params = self.__get_config(ParamsConfig, kwargs)
+ self._backend_query = self.__get_config(QueryConfig, kwargs)
+ self._backend_params = self.__get_config(ParamsConfig, kwargs)
  if self.backend == 'django_db':
  self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
  elif self.backend == 'parquet':
@@ -89,42 +90,42 @@ class DfHelper:

  def load(self, **options):
  # this will be the universal method to load data from a df irrespective of the backend
- df = self._load(**options)
+ df = self.__load(**options)
  if self.as_pandas:
  return df.compute()
  return df

- def _load(self, **options):
+ def __load(self, **options):

  if self.backend == 'django_db':
- self.backend_params.parse_params(options)
- return self._load_from_db(**options)
+ self._backend_params.parse_params(options)
+ return self.__load_from_db(**options)
  elif self.backend == 'sqlalchemy':
- self.backend_params.parse_params(options)
- return self._load_from_sqlalchemy(**options)
+ self._backend_params.parse_params(options)
+ return self.__load_from_sqlalchemy(**options)
  elif self.backend == 'parquet':
- return self._load_from_parquet(**options)
+ return self.__load_from_parquet(**options)
  elif self.backend == 'http':
  if asyncio.get_event_loop().is_running():
  self.logger.debug("Running as a task from an event loop")
- return asyncio.create_task(self._load_from_http(**options))
+ return asyncio.create_task(self.__load_from_http(**options))
  else:
  self.logger.debug("Regular asyncio run...")
- return asyncio.run(self._load_from_http(**options))
+ return asyncio.run(self.__load_from_http(**options))

- def _load_from_sqlalchemy(self, **options):
+ def __load_from_sqlalchemy(self, **options):
  try:
  options.setdefault("debug", self.debug)
  db_loader = SqlAlchemyLoadFromDb(
  self.backend_sqlalchemy,
- self.backend_query,
- self.backend_params,
+ self._backend_query,
+ self._backend_params,
  self.logger,
  **options
  )
  self.df = db_loader.build_and_load()
- self._process_loaded_data()
- self._post_process_df()
+ self.__process_loaded_data()
+ self.__post_process_df()
  self.logger.debug("Data successfully loaded from sqlalchemy database.")
  except Exception as e:
  self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
@@ -132,19 +133,19 @@ class DfHelper:

  return self.df

- def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
  try:
  options.setdefault("debug", self.debug)
  db_loader = DjangoLoadFromDb(
  self.backend_django,
- self.backend_query,
- self.backend_params,
+ self._backend_query,
+ self._backend_params,
  self.logger,
  **options
  )
  self.df = db_loader.build_and_load()
- self._process_loaded_data()
- self._post_process_df()
+ self.__process_loaded_data()
+ self.__post_process_df()
  self.logger.debug("Data successfully loaded from django database.")
  except Exception as e:
  self.logger.debug(f"Failed to load data from django database: {e}")
@@ -152,7 +153,7 @@ class DfHelper:

  return self.df

- async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
  """Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
  if not self.backend_http:
  self.logger.debug("HTTP plugin not configured properly.")
@@ -164,12 +165,12 @@ class DfHelper:
  self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
  return self.df

- def _post_process_df(self):
+ def __post_process_df(self):
  """
  Efficiently process the DataFrame by filtering, renaming, and setting indices.
  Optimized for large datasets with Dask compatibility.
  """
- df_params = self.backend_params.df_params
+ df_params = self._backend_params.df_params
  fieldnames = df_params.get("fieldnames", None)
  index_col = df_params.get("index_col", None)
  datetime_index = df_params.get("datetime_index", False)
@@ -203,10 +204,10 @@ class DfHelper:

  self.logger.debug("Post-processing of DataFrame completed.")

- def _process_loaded_data(self):
+ def __process_loaded_data(self):
  self.logger.debug(f"Type of self.df: {type(self.df)}")
  if self.df.map_partitions(len).compute().sum() > 0:
- field_map = self.backend_params.field_map or {}
+ field_map = self._backend_params.field_map or {}
  if isinstance(field_map, dict):
  rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
  missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
@@ -237,7 +238,7 @@ class DfHelper:
  cs.save_to_clickhouse(self.df)
  self.logger.debug("Save to ClickHouse completed.")

- def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
  self.df = self.backend_parquet.load_files()
  if options:
  """
@@ -274,7 +275,7 @@ class DfHelper:
  raise ValueError("The 'start' date cannot be later than the 'end' date.")

  # Reverse map to original field name
- field_map = getattr(self.backend_params, 'field_map', {}) or {}
+ field_map = getattr(self._backend_params, 'field_map', {}) or {}
  reverse_map = {v: k for k, v in field_map.items()}
  mapped_field = reverse_map.get(dt_field, dt_field)

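Note: the hunks above make the loader internals name-mangled (__load, __load_from_db, __post_process_df, ...) and rename the config attributes to _backend_query/_backend_params, so callers should rely only on the public surface. A minimal usage sketch of that surface follows; the sqlalchemy connection kwargs and the field name are illustrative assumptions, not taken from this diff, and load_period() is shown as it is used by DataWrapper later in this diff.

    # Sketch of the public DfHelper surface that survives the rename above.
    # The connection kwargs and field names are illustrative only.
    from sibi_dst.df_helper import DfHelper

    helper = DfHelper(
        backend="sqlalchemy",                   # one of: django_db, sqlalchemy, parquet, http
        connection_url="sqlite:///example.db",  # hypothetical connection settings
        table="orders",
        as_pandas=True,
        debug=True,
    )
    df = helper.load()                          # public entry point; __load() is internal now
    period = helper.load_period(dt_field="created_at", start="2024-01-01", end="2024-01-31")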
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_parquet_artifact.py
@@ -1,6 +1,8 @@
  from typing import Optional, Any, Dict
+
  import dask.dataframe as dd
  import fsspec
+
  from sibi_dst.df_helper import DfHelper
  from sibi_dst.utils import DataWrapper
  from sibi_dst.utils import DateUtils
@@ -106,9 +108,10 @@ class ParquetArtifact(DfHelper):
  'parquet_start_date': start_date.strftime('%Y-%m-%d'),
  'parquet_end_date': end_date.strftime('%Y-%m-%d'),
  }
+
  def ensure_directory_exists(self, path: str) -> None:
  """Ensure the directory exists in the specified filesystem."""
  try:
  self.fs.makedirs(path, exist_ok=True)
  except Exception as e:
- raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+ raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/_parquet_reader.py
@@ -5,6 +5,7 @@ import fsspec

  from sibi_dst.df_helper import DfHelper

+
  class ParquetReader(DfHelper):
  DEFAULT_CONFIG = {
  'backend': 'parquet'
@@ -46,4 +47,4 @@ class ParquetReader(DfHelper):
  info = self.fs.info(self.parquet_storage_path)
  return info['type'] == 'directory'
  except FileNotFoundError:
- return False
+ return False
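The ParquetReader changes are whitespace-only, but the hunk shows it is a thin DfHelper subclass whose DEFAULT_CONFIG pins the backend to 'parquet'. A hedged construction sketch follows; the keyword names mirror the ParquetConfig fields that appear later in this diff and the local path is a placeholder.

    # Hedged sketch; kwarg names follow the ParquetConfig fields shown further down.
    from sibi_dst.df_helper import ParquetReader

    reader = ParquetReader(
        parquet_storage_path="file:///data/warehouse/orders",  # placeholder location
        parquet_start_date="2024-01-01",
        parquet_end_date="2024-01-31",
    )
    ddf = reader.load()   # inherited from DfHelper; a dask DataFrame unless as_pandas=True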
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/__init__.py
@@ -1,9 +1,8 @@
  from __future__ import annotations

  from ._django_db_connection import DjangoConnectionConfig
- from ._io_dask import ReadFrameDask
- #from ._io_dask_alt import ReadFrameDask
  from ._django_load_from_db import DjangoLoadFromDb
+ from ._io_dask import ReadFrameDask

  __all__ = [
  "DjangoConnectionConfig",
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_db_connection.py
@@ -1,4 +1,4 @@
- from typing import Any, Dict, Union
+ from typing import Any

  from pydantic import BaseModel, model_validator

{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_load_from_db.py
@@ -2,12 +2,12 @@ import warnings

  import dask.dataframe as dd
  import pandas as pd
- from IPython.core.hooks import deprecated
  from django.db.models import Q

  from sibi_dst.df_helper.backends.django import ReadFrameDask
- from sibi_dst.utils import Logger
  from sibi_dst.df_helper.core import django_field_conversion_map_dask
+ from sibi_dst.utils import Logger
+

  class DjangoLoadFromDb:
  df: dd.DataFrame
@@ -28,17 +28,16 @@ class DjangoLoadFromDb:

  def build_and_load(self):
  self.df = self._build_and_load()
- #self.df = self._convert_columns(self.df)
+ # self.df = self._convert_columns(self.df)
  return self.df

-
  def _build_and_load(self) -> dd.DataFrame:
  query = self.connection_config.model.objects.using(self.connection_config.connection_name)
  if not self.params_config.filters:
  # IMPORTANT: if no filters are provided show only the first n_records
  # this is to prevent loading the entire table by mistake
  n_records = self.query_config.n_records if self.query_config.n_records else 100
- queryset=query.all()[:n_records]
+ queryset = query.all()[:n_records]
  else:
  q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
  queryset = query.filter(q_objects)
@@ -99,13 +98,12 @@ class DjangoLoadFromDb:
  # Simplified loop to apply conversions partition-wise
  for field_name, field_type in field_type_map.items():
  if field_name not in df.columns:
-
  self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
  continue

  conversion_func = django_field_conversion_map_dask.get(field_type)
  if not conversion_func:
- message=f"Field type '{field_type}' not found in conversion_map."
+ message = f"Field type '{field_type}' not found in conversion_map."
  self.logger.debug(message)
  continue

@@ -130,4 +128,4 @@ class DjangoLoadFromDb:
  except Exception as e:
  self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")

- return df
+ return df
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py
@@ -219,9 +219,9 @@ class DjangoSqlModelBuilder:
  if field_type == "AutoField(":
  continue
  elif (
- field_type
- == connection.features.introspected_field_types["AutoField"]
- + "("
+ field_type
+ == connection.features.introspected_field_types["AutoField"]
+ + "("
  ):
  comment_notes.append("AutoField?")

@@ -240,8 +240,8 @@ class DjangoSqlModelBuilder:

  # Add comment.
  if (
- hasattr(connection.features, "supports_comments")
- and row.comment
+ hasattr(connection.features, "supports_comments")
+ and row.comment
  ):
  extra_params["db_comment"] = row.comment
  # if connection.features.supports_comments and row.comment:
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_io_dask.py
@@ -1,7 +1,6 @@
  import itertools

  import dask.dataframe as dd
- import dask_expr
  import django
  import pandas as pd
  from django.core.cache import cache
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/django/_io_dask_alt.py
@@ -1,13 +1,14 @@
  import itertools
+
  import dask.dataframe as dd
+ import django
  import pandas as pd
-
  from django.core.cache import cache
+ from django.core.exceptions import FieldDoesNotExist
  from django.db import models
  from django.db.models import Field
  from django.utils.encoding import force_str as force_text
- import django
- from django.core.exceptions import FieldDoesNotExist
+

  class ReadFrameDask:
  FieldDoesNotExist = (
@@ -185,4 +186,4 @@ class ReadFrameDask:
  if verbose:
  self.update_with_verbose(dask_df, fieldnames, qs.model._meta.fields)

- return dask_df
+ return dask_df
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/http/__init__.py
@@ -2,6 +2,6 @@ from __future__ import annotations

  from ._http_config import HttpConfig

- __all__=[
+ __all__ = [
  'HttpConfig'
- ]
+ ]
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/http/_http_config.py
@@ -1,10 +1,13 @@
- from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
  from typing import Dict, Optional, Any
- import httpx
+
  import dask.dataframe as dd
+ import httpx
  import pandas as pd
+ from pydantic import BaseModel, HttpUrl, Field, ConfigDict, SecretStr
+
  from sibi_dst.utils import Logger

+
  class HttpConfig(BaseModel):
  base_url: HttpUrl
  params: Optional[Dict[str, Any]] = Field(default_factory=dict)
@@ -44,4 +47,4 @@ class HttpConfig(BaseModel):
  raise
  except ValueError as e:
  self.logger.debug(f"Error parsing JSON data: {e}")
- raise
+ raise
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/__init__.py
@@ -1,9 +1,9 @@
  from __future__ import annotations

- from ._parquet_options import *
  from ._parquet_filter_handler import ParquetFilterHandler
+ from ._parquet_options import *

- __all__=[
+ __all__ = [
  "ParquetConfig",
  "ParquetFilterHandler",
- ]
+ ]
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py
@@ -1,7 +1,9 @@
- import pandas as pd
  import dask.dataframe as dd
+ import pandas as pd
+
  from sibi_dst.utils import Logger

+
  class ParquetFilterHandler(object):
  def __init__(self, logger=None):
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -92,4 +94,4 @@ class ParquetFilterHandler(object):
  else:
  raise ValueError(f"Unsupported operation: {operation}")

- return df
+ return df
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -1,12 +1,15 @@
+ import datetime
  from pathlib import Path
  from typing import Optional, List
+
  import dask.dataframe as dd
- from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
  import fsspec
- import datetime
+ from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
+
  from sibi_dst.utils import FilePathGenerator
  from sibi_dst.utils import Logger

+
  class ParquetConfig(BaseModel):
  load_parquet: bool = False
  parquet_filename: Optional[str] = None
@@ -27,7 +30,8 @@ class ParquetConfig(BaseModel):
  # Configure paths based on fsspec
  if self.logger is None:
  self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
- self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])
+ self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+ str(self.parquet_storage_path).split("://")[0])

  # Validation for parquet path
  if self.parquet_storage_path is None:
@@ -37,7 +41,8 @@ class ParquetConfig(BaseModel):
  self.load_parquet = False
  if self.parquet_filename is not None:
  self.parquet_full_path = self.ensure_file_extension(
- filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]), extension='parquet'
+ filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
+ extension='parquet'
  )
  self.parquet_is_recent = self.is_file_recent()
  self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)
@@ -52,10 +57,11 @@ class ParquetConfig(BaseModel):
  raise ValueError('Parquet end date must be greater than start date')

  # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
- self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), logger=self.logger).generate_file_paths(start_date, end_date)
+ self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+ logger=self.logger).generate_file_paths(start_date, end_date)
  self.parquet_size_bytes = self.get_parquet_size_bytes()
  self.load_parquet = True
- #self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
+ # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
  elif self.parquet_end_date is not None:
  raise ValueError('Parquet start date must be specified if end date is provided')

@@ -88,4 +94,3 @@ class ParquetConfig(BaseModel):
  def ensure_file_extension(filepath: str, extension: str) -> str:
  path = Path(filepath)
  return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
-
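The rewrapped filesystem line in the first ParquetConfig hunk keeps the same rule: a storage path without a "://" prefix falls back to the local filesystem, otherwise the protocol portion is handed to fsspec. The same rule in isolation; the s3 path is only an example, and remote protocols need the matching fsspec backend installed.

    # Same protocol-selection rule as ParquetConfig above, shown standalone.
    import fsspec

    path = "s3://my-bucket/datasets/orders"                       # example path, not from this package
    protocol = path.split("://")[0] if "://" in path else "file"
    fs = fsspec.filesystem(protocol)                              # requires s3fs for "s3", etc.
    print(type(fs).__name__)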
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py
@@ -74,7 +74,9 @@ class SQLAlchemyDask:
  deprecated specific filter handling to a generic one
  #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
  """
- self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
+ self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query,
+ model=self.model,
+ filters=self.filters)
  else:
  n_records = 100
  self.query = self.query.limit(n_records)
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py
@@ -52,7 +52,6 @@ class SqlAlchemyFilterHandler:
  return [datetime.date.fromisoformat(v) for v in value]
  return value

-
  def handle_date_operator(column, date_op):
  """
  Handle filtering on specific datetime parts (e.g., year, month).
@@ -93,7 +92,7 @@ class SqlAlchemyFilterHandler:

  # Get the column from the model
  column = getattr(model, field_name, None)
- #column = model.__table__.columns.get(field_name)
+ # column = model.__table__.columns.get(field_name)
  if not column:
  raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")

@@ -117,4 +116,4 @@ class SqlAlchemyFilterHandler:
  else:
  raise ValueError(f"Unsupported operation: {operation}")

- return query
+ return query
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -1,5 +1,4 @@
  import dask.dataframe as dd
- import dask_expr
  import pandas as pd

  from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -7,6 +6,7 @@ from sibi_dst.utils import Logger
  from ._io_sqlalchemy_dask import SQLAlchemyDask
  from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig

+
  class SqlAlchemyLoadFromDb:
  df: dd.DataFrame = None

@@ -52,7 +52,7 @@ class SqlAlchemyLoadFromDb:

  if self.df is None or len(self.df.head().index) == 0:
  self.logger.debug("Query returned no results.")
- dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+ dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)

  return dask_df
  return self.df
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py
@@ -1,15 +1,17 @@
  import re
+
  from sqlalchemy import MetaData, Table
  from sqlalchemy.orm import declarative_base, relationship

  # Base class for dynamically created models
  Base = declarative_base()

-
  apps_label = "datacubes"

+
  class SqlAlchemyModelBuilder:
  _model_cache = {} # Local cache for model classes
+
  def __init__(self, engine, table_name):
  """
  Initialize the model builder with a database engine and specific table.
@@ -58,7 +60,7 @@ class SqlAlchemyModelBuilder:

  # Add columns and relationships to the model
  attrs.update(columns)
- #self.add_relationships(attrs, self.table)
+ # self.add_relationships(attrs, self.table)
  model = Base.registry._class_registry.get(self.class_name)
  if not model:
  model = type(self.class_name, (Base,), attrs)
@@ -126,4 +128,4 @@ class SqlAlchemyModelBuilder:
  column_name = re.sub(r"\W|^(?=\d)", "_", column_name)
  if column_name in {"class", "def", "return", "yield", "global"}:
  column_name += "_field"
- return column_name
+ return column_name
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py
@@ -1,9 +1,10 @@
+ import datetime
  from typing import Any, Optional, Dict, Type
+
  from pydantic import BaseModel, model_validator
- from sqlmodel import SQLModel, Field, create_engine
  from sqlalchemy import inspect
- from sqlalchemy.sql import text
  from sqlalchemy.exc import OperationalError
+ from sqlalchemy.sql import text
  from sqlalchemy.sql.sqltypes import (
  Integer,
  String,
@@ -14,7 +15,7 @@ from sqlalchemy.sql.sqltypes import (
  Time,
  Numeric,
  )
- import datetime
+ from sqlmodel import SQLModel, Field, create_engine


  class SQLModelConnectionConfig(BaseModel):
@@ -130,4 +131,4 @@ class SQLModelConnectionConfig(BaseModel):
  @staticmethod
  def _table2model(table_name: str) -> str:
  """Convert table name to PascalCase model name."""
- return "".join(word.capitalize() for word in table_name.split("_"))
+ return "".join(word.capitalize() for word in table_name.split("_"))
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py
@@ -1,19 +1,21 @@
- import dask.dataframe as dd
- from sqlmodel import Session, select, text
- from typing import Any, Dict, Optional
  import logging
+ from typing import Any, Dict, Optional
+
+ import dask.dataframe as dd
  import pandas as pd
+ from sqlmodel import Session, select, text
+

  class SQLModelLoadFromDb:
  df: dd.DataFrame

  def __init__(
- self,
- db_connection,
- db_query: Optional[Dict[str, Any]] = None,
- db_params: Optional[Dict[str, Any]] = None,
- logger=None,
- **kwargs,
+ self,
+ db_connection,
+ db_query: Optional[Dict[str, Any]] = None,
+ db_params: Optional[Dict[str, Any]] = None,
+ logger=None,
+ **kwargs,
  ):
  """
  Initialize the loader with database connection, query, and parameters.
@@ -74,7 +76,7 @@ class SQLModelLoadFromDb:
  results = session.exec(query).fetchall()

  # Convert query results to a Dask DataFrame
- print("results:",results)
+ print("results:", results)
  if results:
  df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
  else:
@@ -96,4 +98,4 @@ class SQLModelLoadFromDb:
  if field_map:
  rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
  if rename_mapping:
- self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
+ self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_defaults.py
@@ -54,8 +54,10 @@ django_field_conversion_map_dask: Dict[str, callable] = {
  "BooleanField": lambda x: x.astype(bool),
  "NullBooleanField": lambda x: x.astype(bool),
  "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
- "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
- "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+ "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+ meta=("date", "object")),
+ "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+ meta=("time", "object")),
  "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
  "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
  "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
@@ -72,12 +74,15 @@ sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
  Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
  Boolean.__name__: lambda x: x.astype(bool),
  DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
- Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
- Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+ Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+ meta=("date", "object")),
+ Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+ meta=("time", "object")),
  JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
  UUID.__name__: lambda x: x.astype(str),
  }

+
  # Conversion map with normalized SQLAlchemy field types
  # sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
  # "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
@@ -129,5 +134,3 @@ def normalize_sqlalchemy_type(field_type):

  # Fallback to raw class name
  return field_type.__class__.__name__
-
-
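The conversion maps above are only reflowed; each entry still maps a Django or SQLAlchemy field-type name to a callable applied to a Dask series. A minimal sketch of applying one entry, mirroring the lookup used by DjangoLoadFromDb earlier in this diff; the column name is a placeholder.

    # Applying one conversion-map entry to a Dask series; BooleanField is the simplest
    # case (a plain astype), while the Date/Time entries wrap their logic in map_partitions.
    import dask.dataframe as dd
    import pandas as pd
    from sibi_dst.df_helper.core import django_field_conversion_map_dask

    ddf = dd.from_pandas(pd.DataFrame({"is_active": [0, 1, 1]}), npartitions=1)
    convert = django_field_conversion_map_dask["BooleanField"]   # lambda x: x.astype(bool)
    ddf["is_active"] = convert(ddf["is_active"])                 # still lazy; .compute() materializes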
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_filter_handler.py
@@ -1,10 +1,13 @@
  import datetime
+
  import dask.dataframe as dd
  import pandas as pd
  from sqlalchemy import func, cast
  from sqlalchemy.sql.sqltypes import Date, Time
+
  from sibi_dst.utils import Logger

+
  class FilterHandler:
  def __init__(self, backend, logger=None):
  """
@@ -15,7 +18,8 @@ class FilterHandler:
  logger: Optional logger for debugging purposes.
  """
  self.backend = backend
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__) # No-op logger if none provided
+ self.logger = logger or Logger.default_logger(
+ logger_name=self.__class__.__name__) # No-op logger if none provided
  self.backend_methods = self._get_backend_methods(backend)

  def apply_filters(self, query_or_df, model=None, filters=None):
@@ -34,7 +38,7 @@ class FilterHandler:
  for key, value in filters.items():
  field_name, casting, operation = self._parse_filter_key(key)
  parsed_value = self._parse_filter_value(casting, value)
- #print(field_name, casting, operation, parsed_value)
+ # print(field_name, casting, operation, parsed_value)
  # Get the column and apply backend-specific transformations
  if self.backend == "sqlalchemy":
  column = self.backend_methods["get_column"](field_name, model, casting)
@@ -67,7 +71,6 @@ class FilterHandler:

  return field_name, casting, operation

-
  def _parse_filter_value(self, casting, value):
  """
  Convert filter value to appropriate type based on the casting (e.g., date).
@@ -213,4 +216,4 @@ class FilterHandler:
  return [
  "gte", "lte", "gt", "lt", "exact", "in", "range",
  "contains", "startswith", "endswith", "isnull",
- ]
+ ]
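The FilterHandler edits are cosmetic, but the hunks document the call shape used by the SQLAlchemy backend (apply_filters(query, model=..., filters=...)) and the supported lookup suffixes. A short sketch of a filters dict in that field__operation form; the field names are placeholders and the import path is assumed from the module location.

    # Filter keys follow the field__operation grammar parsed by _parse_filter_key above.
    from sibi_dst.df_helper.core import FilterHandler   # import path assumed from module location

    filters = {
        "status__in": ["open", "pending"],   # membership test
        "amount__gte": 100,                  # comparison
        "closed_at__isnull": True,           # null check
    }
    handler = FilterHandler(backend="sqlalchemy")
    # Applied exactly as in _io_sqlalchemy_dask.py above:
    # handler.apply_filters(query, model=model, filters=filters)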
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_params_config.py
@@ -1,7 +1,7 @@
+ from typing import Optional, Dict, Union, List

  from pydantic import BaseModel, model_validator, Field

- from typing import Optional, Dict, Union, List
  dataframe_params: Dict[str, Union[None, str, bool, int, None]] = {
  "fieldnames": None,
  "index_col": None,
@@ -25,6 +25,7 @@ dataframe_options: Dict[str, Union[bool, str, int, None]] = {

  LOOKUP_SEP = "__"

+
  class ParamsConfig(BaseModel):
  field_map: Optional[Dict] = Field(default_factory=dict)
  legacy_filters: bool = False
@@ -76,4 +77,4 @@ class ParamsConfig(BaseModel):
  new_filter_field = LOOKUP_SEP.join(new_parts)
  new_filters[new_filter_field] = value

- self.filters = new_filters
+ self.filters = new_filters
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/core/_query_config.py
@@ -1,7 +1,5 @@
  from typing import Optional

- import dask.dataframe as dd
- import pandas as pd
  from pydantic import BaseModel, model_validator


{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/__init__.py
@@ -1,18 +1,19 @@
  from __future__ import annotations
+
+ from ._airflow_manager import AirflowDAGManager
+ from ._clickhouse_writer import ClickHouseWriter
  from ._credentials import *
- from ._log_utils import Logger
- from ._date_utils import *
  from ._data_utils import DataUtils
+ from ._data_wrapper import DataWrapper
+ from ._date_utils import *
+ from ._df_utils import DfUtils
  from ._file_utils import FileUtils
  from ._filepath_generator import FilePathGenerator
- from ._df_utils import DfUtils
- from ._storage_manager import StorageManager
+ from ._log_utils import Logger
  from ._parquet_saver import ParquetSaver
- from ._clickhouse_writer import ClickHouseWriter
- from ._data_wrapper import DataWrapper
- from ._airflow_manager import AirflowDAGManager
+ from ._storage_manager import StorageManager

- __all__=[
+ __all__ = [
  "ConfigManager",
  "ConfigLoader",
  "Logger",
@@ -27,4 +28,4 @@ __all__=[
  "DfUtils",
  "ClickHouseWriter",
  "AirflowDAGManager",
- ]
+ ]
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_airflow_manager.py
@@ -1,8 +1,9 @@
  import os
- from jinja2 import Template
  from datetime import datetime
+
  import fsspec
  import httpx
+ from jinja2 import Template

  """
  A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.
@@ -54,8 +55,8 @@ with DAG(
  {% endfor %}
  """

- class AirflowDAGManager:

+ class AirflowDAGManager:

  def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
  """
@@ -208,4 +209,4 @@ class AirflowDAGManager:
  return response.json()
  except httpx.RequestError as e:
  print(f"Failed to trigger DAG {dag_id}: {e}")
- raise
+ raise
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_clickhouse_writer.py
@@ -1,9 +1,12 @@
+ from concurrent.futures import ThreadPoolExecutor
+
  import clickhouse_connect
+ import pandas as pd
  from clickhouse_driver import Client
  from dask.dataframe import dd
- import pandas as pd
+
  from sibi_dst.utils import Logger
- from concurrent.futures import ThreadPoolExecutor
+

  class ClickHouseWriter:
  dtype_to_clickhouse = {
@@ -19,20 +22,20 @@ class ClickHouseWriter:
  df: dd.DataFrame

  def __init__(self, logger=None, **kwargs):
- self.clickhouse_host = kwargs.setdefault('host',"localhost")
- self.clickhouse_port = kwargs.setdefault('port',8123)
- self.clickhouse_dbname = kwargs.setdefault('database','sibi_data')
- self.clickhouse_user = kwargs.setdefault('user','default')
- self.clickhouse_password = kwargs.setdefault('password','')
- self.clickhouse_table = kwargs.setdefault('table','test_sibi_table')
+ self.clickhouse_host = kwargs.setdefault('host', "localhost")
+ self.clickhouse_port = kwargs.setdefault('port', 8123)
+ self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
+ self.clickhouse_user = kwargs.setdefault('user', 'default')
+ self.clickhouse_password = kwargs.setdefault('password', '')
+ self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')

  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
  self.client = None
- self.order_by=kwargs.setdefault('order_by','id')
+ self.order_by = kwargs.setdefault('order_by', 'id')

  def save_to_clickhouse(self, df, **kwargs):
  self.df = df.copy()
- self.order_by = kwargs.setdefault('order_by',self.order_by)
+ self.order_by = kwargs.setdefault('order_by', self.order_by)
  if len(self.df.head().index) == 0:
  self.logger.debug("Dataframe is empty")
  return
@@ -86,8 +89,8 @@ class ClickHouseWriter:
  if engine is None:
  engine = f"ENGINE = MergeTree() order by {self.order_by}"
  dtypes = self.df.dtypes
- clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
- create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
+ clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
+ create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
  self.logger.debug(f"Creating table SQL:{create_table_sql}")
  if self.client:
  self.client.command(create_table_sql)
@@ -200,4 +203,4 @@ class ClickHouseWriter:
  with ThreadPoolExecutor() as executor:
  executor.map(write_partition, partitions, range(len(partitions)))
  except Exception as e:
- self.logger.error(f"Error during multi-partition write: {e}")
+ self.logger.error(f"Error during multi-partition write: {e}")
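The ClickHouseWriter hunks only adjust spacing, but they spell out the constructor defaults (host, port, database, user, password, table, order_by) and the save_to_clickhouse(df, **kwargs) entry point. A short usage sketch; the connection values and table name are illustrative.

    # Connection values are illustrative; the defaults shown above apply when omitted.
    import dask.dataframe as dd
    import pandas as pd
    from sibi_dst.utils import ClickHouseWriter

    ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "total": [10.5, 20.0]}), npartitions=1)
    writer = ClickHouseWriter(
        host="localhost", port=8123, database="sibi_data",
        user="default", password="", table="orders_summary",
    )
    writer.save_to_clickhouse(ddf, order_by="id")   # creates the table if needed, then writes partitions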
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_data_wrapper.py
@@ -1,12 +1,15 @@
  import datetime
  from typing import Type, Any, Dict, Optional
+
  import fsspec
  import pandas as pd
  from IPython.display import display
- from sibi_dst.utils import Logger
  from tqdm import tqdm
+
+ from sibi_dst.utils import Logger
  from sibi_dst.utils import ParquetSaver

+
  class DataWrapper:
  DEFAULT_MAX_AGE_MINUTES = 1440
  DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -89,7 +92,7 @@ class DataWrapper:
  # Filter dates in the category where `update_required` is True
  dates_to_process = update_plan_table[
  (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
- ]["date"].tolist()
+ ]["date"].tolist()

  date_iterator = dates_to_process
  if self.show_progress:
@@ -130,7 +133,7 @@ class DataWrapper:
  data_object = self.dataclass(**self.class_params)
  df = data_object.load_period(dt_field=self.date_field, start=date, end=date)

- if len(df.index)==0:
+ if len(df.index) == 0:
  self.logger.error("No data found for the specified date.")
  return

@@ -194,7 +197,7 @@ class DataWrapper:
  "missing_file": missing_file,
  "update_required": update_required,
  "update_category": category,
- "datawrapper class":self.dataclass.__name__
+ "datawrapper class": self.dataclass.__name__
  })

  update_plan_table = pd.DataFrame(rows)
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_date_utils.py
@@ -1,8 +1,9 @@
  import datetime
- from typing import Union, Tuple, Callable, Dict, Any
+ from typing import Union, Tuple, Callable, Dict

  import numpy as np
  import pandas as pd
+
  from sibi_dst.utils import Logger


@@ -32,7 +33,8 @@ class DateUtils:
  raise ValueError(f"Unsupported date format: {value}")

  @classmethod
- def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[datetime.date, datetime.date]:
+ def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+ datetime.date, datetime.date]:
  """
  Calculate the start and end of the week for a given reference date.
  """
@@ -49,7 +51,8 @@ class DateUtils:
  return datetime.date(year, 1, 1), datetime.date(year, 12, 31)

  @classmethod
- def get_first_day_of_the_quarter(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
+ def get_first_day_of_the_quarter(cls, reference_date: Union[
+ str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
  """
  Get the first day of the quarter for a given date.
  """
@@ -58,7 +61,8 @@ class DateUtils:
  return datetime.date(reference_date.year, 3 * quarter - 2, 1)

  @classmethod
- def get_last_day_of_the_quarter(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
+ def get_last_day_of_the_quarter(cls, reference_date: Union[
+ str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
  """
  Get the last day of the quarter for a given date.
  """
@@ -116,10 +120,12 @@ class DateUtils:
  'current_month': lambda: cls.get_month_range(n=0),
  'last_month': lambda: cls.get_month_range(n=-1),
  'current_year': lambda: cls.get_year_timerange(today().year),
- 'current_quarter': lambda: (cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
+ 'current_quarter': lambda: (
+ cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
  'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
  }

+
  class BusinessDays:
  def __init__(self, holiday_list, logger):
  """
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_df_utils.py
@@ -1,7 +1,9 @@
- import pandas as pd
  import dask.dataframe as dd
+ import pandas as pd
+
  from ._log_utils import Logger

+
  class DfUtils:
  def __init__(self, logger=None):
  """
@@ -210,7 +212,7 @@ class DfUtils:
  df['Total'] = df.sum(axis=1, numeric_only=True)
  return df

- def summarise_data(self,df, summary_column, values_column, rule='D', agg_func='count'):
+ def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
  """
  Summarizes data by creating a pivot table and resampling.

@@ -233,10 +235,12 @@ class DfUtils:
  df = df.set_index(dd.to_datetime(df.index))

  # Group by index and summary columns
- df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(agg_func).reset_index()
+ df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+ agg_func).reset_index()

  # Pivot the table
- df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column, aggfunc='sum').fillna(0)
+ df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+ aggfunc='sum').fillna(0)

  # Resample
  df_pivot.index = dd.to_datetime(df_pivot.index)
@@ -269,4 +273,4 @@ class DfUtils:
  Returns:
  DataFrame: Resampled pivot table.
  """
- return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
+ return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_file_utils.py
@@ -1,10 +1,12 @@
  import shutil
  from pathlib import Path
  from typing import Optional
+
  import fsspec

  from sibi_dst.utils import Logger

+
  class FileUtils:
  def __init__(self, logger=None):
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -27,7 +29,7 @@ class FileUtils:
  fs.mkdirs(path)

  @staticmethod
- def construct_full_path(storage_path:str, parquet_filename: Optional[str]) -> Path:
+ def construct_full_path(storage_path: str, parquet_filename: Optional[str]) -> Path:
  """Construct and return the full path for the parquet file."""
  fs, base_path = fsspec.core.url_to_fs(storage_path)
  parquet_filename = parquet_filename or "default.parquet"
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_filepath_generator.py
@@ -1,7 +1,8 @@
  import datetime
- import fsspec
  import re

+ import fsspec
+
  from sibi_dst.utils import Logger


@@ -150,6 +151,7 @@ class FilePathGenerator:
  return datetime.datetime.strptime(date, '%Y-%m-%d')
  return date

+
  """
  Usage:
  # Initialize the generator
@@ -182,4 +184,4 @@ for fp in file_paths:

  df_pandas = pd.concat(dataframes, ignore_index=True)
  print(df_pandas.head())
- """
+ """
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_log_utils.py
@@ -71,4 +71,4 @@ class Logger:
  self.logger.error(msg)

  def critical(self, msg):
- self.logger.critical(msg)
+ self.logger.critical(msg)
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_parquet_saver.py
@@ -1,7 +1,6 @@
  from pathlib import Path
  from typing import Optional

- import dask_expr
  import fsspec
  import pyarrow as pa

@@ -103,4 +102,3 @@ class ParquetSaver:
  self.df_result.to_parquet(
  str(full_path), engine="pyarrow", schema=schema, write_index=False
  )
-
Files without changes (per the list above): README.md, sibi_dst/df_helper/backends/__init__.py, sibi_dst/df_helper/core/__init__.py, sibi_dst/utils/_data_utils.py
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py
@@ -1,6 +1,6 @@
+ from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
  from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
- from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder
  from ._sqlalchemy_load_from_db import SqlAlchemyLoadFromDb
- from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+ from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder

  __all__ = ['SqlAlchemyConnectionConfig', 'SqlAlchemyModelBuilder', 'SqlAlchemyLoadFromDb', 'SqlAlchemyFilterHandler']
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py
@@ -1,18 +1,19 @@
  from typing import Any, Optional

  from pydantic import BaseModel, model_validator
+ from sqlalchemy import create_engine
  from sqlalchemy.exc import OperationalError
  from sqlalchemy.sql import text
- from sqlalchemy import create_engine
+
  from ._sqlalchemy_model_builder import SqlAlchemyModelBuilder

+
  class SqlAlchemyConnectionConfig(BaseModel):
  connection_url: str
  table: Optional[str] = None
  model: Any = None
  engine: Optional[Any] = None # Save engine to reuse it

-
  @model_validator(mode="after")
  def validate_and_initialize(self):
  """
@@ -45,4 +46,3 @@ class SqlAlchemyConnectionConfig(BaseModel):
  connection.execute(text("SELECT 1"))
  except OperationalError as e:
  raise ValueError(f"Failed to connect to the database: {e}")
-
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/df_helper/backends/sql_model/__init__.py
@@ -1,7 +1,7 @@
  from __future__ import annotations

- from ._sqlmodel_load_from_db import SQLModelLoadFromDb
  from ._sqlmodel_db_connection import SQLModelConnectionConfig
+ from ._sqlmodel_load_from_db import SQLModelLoadFromDb

  __all__ = [
  "SQLModelLoadFromDb",
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_credentials.py
@@ -1,5 +1,6 @@
  import os

+
  class ConfigLoader:
  def __init__(self, prefix, keys, defaults=None):
  """
@@ -54,4 +55,3 @@ class ConfigManager:
  :return: The configuration dictionary.
  """
  return self.configurations.get(name, {})
-
{sibi_dst-0.3.19 → sibi_dst-0.3.21}/sibi_dst/utils/_storage_manager.py
@@ -1,4 +1,5 @@
  from types import SimpleNamespace
+
  import fsspec


@@ -86,4 +87,3 @@ class StorageManager:
  print("Rebuilding depot structure...")
  self.rebuild_depot_paths(depots, clear_existing=clear_existing)
  print("Rebuild complete.")
-