sibi-dst 0.3.16__tar.gz → 0.3.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/pyproject.toml +1 -1
  3. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/_df_helper.py +49 -51
  4. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/_parquet_artifact.py +1 -1
  5. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_django_load_from_db.py +1 -1
  6. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_io_sqlalchemy_dask.py +2 -3
  7. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlalchemy_load_from_db.py +5 -2
  8. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/__init__.py +2 -2
  9. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_data_wrapper.py +2 -3
  10. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/README.md +0 -0
  11. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/__init__.py +0 -0
  12. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/__init__.py +0 -0
  13. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/__init__.py +0 -0
  14. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/__init__.py +0 -0
  15. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_django_db_connection.py +0 -0
  16. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_django_sql_model_builder.py +0 -0
  17. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_io_dask.py +0 -0
  18. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_io_dask_alt.py +0 -0
  19. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/http/__init__.py +0 -0
  20. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/http/_http_config.py +0 -0
  21. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/parquet/__init__.py +0 -0
  22. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/parquet/_parquet_filter_handler.py +0 -0
  23. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/parquet/_parquet_options.py +0 -0
  24. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/__init__.py +0 -0
  25. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
  26. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  27. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
  28. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_model/__init__.py +0 -0
  29. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_model/_sqlmodel_db_connection.py +0 -0
  30. {sibi_dst-0.3.16/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_model/_sqlmodel_load_from_db.py +0 -0
  31. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/__init__.py +0 -0
  32. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_defaults.py +0 -0
  33. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  34. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_params_config.py +0 -0
  35. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_query_config.py +0 -0
  36. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_airflow_manager.py +0 -0
  37. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_clickhouse_writer.py +0 -0
  38. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_credentials.py +0 -0
  39. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_data_utils.py +0 -0
  40. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_date_utils.py +0 -0
  41. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_df_utils.py +0 -0
  42. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_file_utils.py +0 -0
  43. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_filepath_generator.py +0 -0
  44. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_log_utils.py +0 -0
  45. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_parquet_saver.py +0 -0
  46. {sibi_dst-0.3.16 → sibi_dst-0.3.18}/sibi_dst/utils/_storage_manager.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.16
+Version: 0.3.18
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.16"
+version = "0.3.18"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
sibi_dst/df_helper/_df_helper.py
@@ -6,17 +6,16 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver, ClickHouseWriter
-from .plugins.django import *
-from .plugins.http import HttpConfig
-from .plugins.parquet import ParquetConfig, ParquetFilterHandler
-from .plugins.sql_alchemy import *
+from .backends.django import *
+from .backends.http import HttpConfig
+from .backends.parquet import ParquetConfig
+from .backends.sql_alchemy import *
 
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
@@ -30,26 +29,25 @@ warnings.filterwarnings(
 
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
-    plugin_django_connection: Optional[DjangoConnectionConfig] = None
-    plugin_query: Optional[QueryConfig] = None
-    plugin_params: Optional[ParamsConfig] = None
-    plugin_parquet: Optional[ParquetConfig] = None
-    plugin_http: Optional[HttpConfig] = None
-    plugin_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
+    backend_django: Optional[DjangoConnectionConfig] = None
+    backend_query: Optional[QueryConfig] = None
+    backend_params: Optional[ParamsConfig] = None
+    backend_parquet: Optional[ParquetConfig] = None
+    backend_http: Optional[HttpConfig] = None
+    backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
     parquet_filename: str = None
     logger: Logger
     default_config: Dict = None
 
-    def __init__(self, source='django_db', **kwargs):
+    def __init__(self, backend='django_db', **kwargs):
         # Ensure default_config is not shared across instances
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
-        self.source = source
+        self.backend = backend
         self.debug = kwargs.setdefault("debug", False)
-        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
         # Configure logger level
         self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
-        # Configure logger level
         self.logger.debug("Logger initialized in DEBUG mode.")
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
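Note: the two user-visible changes in this hunk are the constructor keyword rename (source → backend) and the ability to inject a caller-owned logger via kwargs. A minimal migration sketch, assuming DfHelper is re-exported from sibi_dst.df_helper and omitting the backend-specific connection kwargs that a real call would need:

    # Hedged sketch: only the renamed/new kwargs are shown; connection
    # settings and other required kwargs are omitted for brevity.
    import logging

    from sibi_dst.df_helper import DfHelper  # import path assumed

    log = logging.getLogger("my_etl")  # hypothetical caller-owned logger

    # 0.3.16: helper = DfHelper(source='sqlalchemy', ...)
    # 0.3.18: the keyword is 'backend', and a passed 'logger' is used
    #         instead of Logger.default_logger(...).
    helper = DfHelper(backend='sqlalchemy', logger=log, debug=True)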
@@ -59,18 +57,18 @@ class DfHelper:
         self.post_init(**kwargs)
 
     def post_init(self, **kwargs):
-        self.logger.debug(f"Source used: {self.source}")
-        self.plugin_query = self.__get_config(QueryConfig, kwargs)
-        self.plugin_params = self.__get_config(ParamsConfig, kwargs)
-        if self.source == 'django_db':
-            self.plugin_django_connection = self.__get_config(DjangoConnectionConfig, kwargs)
-        elif self.source == 'parquet':
+        self.logger.debug(f"backend used: {self.backend}")
+        self.backend_query = self.__get_config(QueryConfig, kwargs)
+        self.backend_params = self.__get_config(ParamsConfig, kwargs)
+        if self.backend == 'django_db':
+            self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
+        elif self.backend == 'parquet':
             self.parquet_filename = kwargs.setdefault("parquet_filename", None)
-            self.plugin_parquet = ParquetConfig(**kwargs)
-        elif self.source == 'http':
-            self.plugin_http = HttpConfig(**kwargs)
-        elif self.source == 'sqlalchemy':
-            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
+            self.backend_parquet = ParquetConfig(**kwargs)
+        elif self.backend == 'http':
+            self.backend_http = HttpConfig(**kwargs)
+        elif self.backend == 'sqlalchemy':
+            self.backend_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
 
     @staticmethod
     def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
@@ -87,7 +85,7 @@ class DfHelper:
         return model(**model_kwargs)
 
     def load(self, **options):
-        # this will be the universal method to load data from a df irrespective of the source
+        # this will be the universal method to load data from a df irrespective of the backend
        df = self._load(**options)
        if self.as_pandas:
            return df.compute()
@@ -95,15 +93,15 @@
 
     def _load(self, **options):
 
-        if self.source == 'django_db':
-            self.plugin_params.parse_params(options)
+        if self.backend == 'django_db':
+            self.backend_params.parse_params(options)
             return self._load_from_db(**options)
-        elif self.source == 'sqlalchemy':
-            self.plugin_params.parse_params(options)
+        elif self.backend == 'sqlalchemy':
+            self.backend_params.parse_params(options)
             return self._load_from_sqlalchemy(**options)
-        elif self.source == 'parquet':
+        elif self.backend == 'parquet':
             return self._load_from_parquet(**options)
-        elif self.source == 'http':
+        elif self.backend == 'http':
             if asyncio.get_event_loop().is_running():
                 self.logger.debug("Running as a task from an event loop")
                 return asyncio.create_task(self._load_from_http(**options))
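Note: per the 'http' branch above, load() hands back an asyncio.Task when a loop is already running, so async callers must await the result rather than use it directly. A sketch under that assumption (import path and missing HTTP kwargs are illustrative, not from this diff):

    # Hedged sketch of consuming the http backend from async code.
    import asyncio

    from sibi_dst.df_helper import DfHelper  # import path assumed

    async def main():
        helper = DfHelper(backend='http')  # HttpConfig kwargs omitted here
        task = helper.load()               # inside a running loop: an asyncio.Task
        df = await task                    # resolves to the loaded dataframe
        return df

    asyncio.run(main())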
@@ -115,9 +113,9 @@
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
-                self.plugin_sqlalchemy,
-                self.plugin_query,
-                self.plugin_params,
+                self.backend_sqlalchemy,
+                self.backend_query,
+                self.backend_params,
                 self.logger,
                 **options
             )
@@ -135,9 +133,9 @@
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
-                self.plugin_django_connection,
-                self.plugin_query,
-                self.plugin_params,
+                self.backend_django,
+                self.backend_query,
+                self.backend_params,
                 self.logger,
                 **options
             )
@@ -152,12 +150,12 @@
         return self.df
 
     async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
-        if not self.plugin_http:
+        """Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
+        if not self.backend_http:
             self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
         try:
-            self.df = await self.plugin_http.fetch_data(**options)
+            self.df = await self.backend_http.fetch_data(**options)
         except Exception as e:
             self.logger.debug(f"Failed to load data from http plugin: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -168,7 +166,7 @@
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
         Optimized for large datasets with Dask compatibility.
         """
-        df_params = self.plugin_params.df_params
+        df_params = self.backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
         index_col = df_params.get("index_col", None)
         datetime_index = df_params.get("datetime_index", False)
@@ -205,7 +203,7 @@
     def _process_loaded_data(self):
         self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
-            field_map = self.plugin_params.field_map or {}
+            field_map = self.backend_params.field_map or {}
             if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
                 missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
@@ -237,7 +235,7 @@
             self.logger.debug("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        self.df = self.plugin_parquet.load_files()
+        self.df = self.backend_parquet.load_files()
         if options:
             """
             deprecated specific filter handling to a generic one
@@ -273,20 +271,20 @@
             raise ValueError("The 'start' date cannot be later than the 'end' date.")
 
         # Reverse map to original field name
-        field_map = getattr(self.plugin_params, 'field_map', {}) or {}
+        field_map = getattr(self.backend_params, 'field_map', {}) or {}
         reverse_map = {v: k for k, v in field_map.items()}
         mapped_field = reverse_map.get(dt_field, dt_field)
 
         # Common logic for Django and SQLAlchemy
-        if self.source == 'django_db':
-            model_fields = {field.name: field for field in self.plugin_django_connection.model._meta.get_fields()}
+        if self.backend == 'django_db':
+            model_fields = {field.name: field for field in self.backend_django.model._meta.get_fields()}
             if mapped_field not in model_fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the Django model.")
             field_type = type(model_fields[mapped_field]).__name__
             is_date_field = field_type == 'DateField'
             is_datetime_field = field_type == 'DateTimeField'
-        elif self.source == 'sqlalchemy':
-            model = self.plugin_sqlalchemy.model
+        elif self.backend == 'sqlalchemy':
+            model = self.backend_sqlalchemy.model
             fields = [column.name for column in model.__table__.columns]
             if mapped_field not in fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the SQLAlchemy model.")
@@ -295,7 +293,7 @@
             is_date_field = field_type == 'DATE'
             is_datetime_field = field_type == 'DATETIME'
         else:
-            raise ValueError(f"Unsupported source '{self.source}'")
+            raise ValueError(f"Unsupported backend '{self.backend}'")
         # Build query filters
         if start == end:
             if is_date_field:
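Note: the field_map inversion earlier in this method is the subtle part: field_map maps model field names to renamed dataframe columns, so a caller-supplied dt_field has to be translated back before it is validated against the model. A standalone illustration with hypothetical names:

    # Hypothetical field_map: model field name -> renamed dataframe column.
    field_map = {"created_at": "order_date"}
    reverse_map = {v: k for k, v in field_map.items()}

    dt_field = "order_date"                             # name the caller passes
    mapped_field = reverse_map.get(dt_field, dt_field)  # back to the model field
    assert mapped_field == "created_at"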
sibi_dst/df_helper/_parquet_artifact.py
@@ -8,7 +8,7 @@ from sibi_dst.utils import DateUtils
 
 class ParquetArtifact(DfHelper):
     DEFAULT_CONFIG = {
-        'source': 'parquet'
+        'backend': 'parquet'
     }
 
     def __init__(self, data_wrapper_class, filesystem_type="file", filesystem_options=None, **kwargs):
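Note: any subclass or call site that seeded configuration with the old 'source' key must switch to 'backend' to stay consistent with DEFAULT_CONFIG above. A hedged sketch, assuming ParquetArtifact is importable as shown and with an illustrative storage path:

    from sibi_dst.df_helper import ParquetArtifact  # import path assumed

    # Hedged sketch: the key rename applies wherever 'source' was used;
    # 'parquet_storage_path' here is illustrative.
    class MyArtifact(ParquetArtifact):
        DEFAULT_CONFIG = {
            'backend': 'parquet',  # was 'source': 'parquet' in 0.3.16
            'parquet_storage_path': '/data/artifacts',
        }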
sibi_dst/df_helper/backends/django/_django_load_from_db.py
@@ -5,7 +5,7 @@ import pandas as pd
 from IPython.core.hooks import deprecated
 from django.db.models import Q
 
-from sibi_dst.df_helper.plugins.django import ReadFrameDask
+from sibi_dst.df_helper.backends.django import ReadFrameDask
 from sibi_dst.utils import Logger
 from sibi_dst.df_helper.core import django_field_conversion_map_dask
 
sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py
@@ -1,14 +1,13 @@
 import itertools
 
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker
 
-from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
-from sibi_dst.utils import Logger
 from sibi_dst.df_helper.core import FilterHandler
+from sibi_dst.utils import Logger
+
 
 class SQLAlchemyDask:
     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -29,6 +29,7 @@ class SqlAlchemyLoadFromDb:
         self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
+        self.chunk_size = kwargs.pop("chunk_size", 1000)
 
     def build_and_load(self) -> dd.DataFrame:
         """
@@ -45,8 +46,10 @@
             filters=self.params_config.filters,
             engine_url=self.engine.url,
             logger=self.logger,
-            chunk_size=1000,
-            debug=self.debug).read_frame()
+            chunk_size=self.chunk_size,
+            debug=self.debug
+        ).read_frame()
+
         if self.df is None or len(self.df.head().index) == 0:
             self.logger.debug("Query returned no results.")
             dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
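Note: taken together, these two hunks replace the hard-coded chunk_size=1000 with a caller-supplied value popped from **kwargs. Assuming the options passed to load() reach SqlAlchemyLoadFromDb unchanged (as the _load_from_sqlalchemy hunk above suggests), tuning it would look like:

    # Hedged sketch: chunk_size now defaults to 1000 but can be overridden;
    # connection kwargs are omitted and would be required in practice.
    from sibi_dst.df_helper import DfHelper  # import path assumed

    helper = DfHelper(backend='sqlalchemy')
    df = helper.load(chunk_size=5000)  # was fixed at 1000 in 0.3.16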
sibi_dst/utils/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
-from ._credentials import ConfigManager, ConfigLoader
+from ._credentials import *
 from ._log_utils import Logger
-from ._date_utils import DateUtils, BusinessDays
+from ._date_utils import *
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
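Note: moving from explicit names to star imports means the package's re-exported surface is now whatever those modules expose; defining __all__ in each module keeps that surface deliberate. A minimal illustration (module contents hypothetical):

    # _date_utils.py (hypothetical sketch): with __all__ defined, the star
    # import above re-exports exactly these names and nothing else.
    __all__ = ["DateUtils", "BusinessDays"]

    class DateUtils: ...
    class BusinessDays: ...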
sibi_dst/utils/_data_wrapper.py
@@ -1,16 +1,15 @@
 import datetime
 from typing import Type, Any, Dict, Optional
 
-import dask_expr
 import fsspec
 import pandas as pd
 from IPython.display import display
-from dask.dataframe import dd
+from tqdm import tqdm
 
 from sibi_dst.utils import Logger
-from tqdm import tqdm
 from sibi_dst.utils import ParquetSaver
 
+
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
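Note: the removed `from dask.dataframe import dd` line was a latent bug (dask.dataframe has no attribute `dd`, so it would raise ImportError); the conventional alias, shown for reference:

    # Standard idiom: alias the dask.dataframe module itself as "dd".
    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)
    print(ddf.compute())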