sibi-dst 0.3.22__tar.gz → 0.3.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/PKG-INFO +3 -2
  2. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/pyproject.toml +3 -2
  3. sibi_dst-0.3.22/sibi_dst/df_helper/backends/django/_io_dask_alt.py +0 -189
  4. sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/__init__.py +0 -9
  5. sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +0 -134
  6. sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +0 -101
  7. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/README.md +0 -0
  8. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/__init__.py +0 -0
  9. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/__init__.py +0 -0
  10. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/_df_helper.py +0 -0
  11. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  12. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  13. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/__init__.py +0 -0
  14. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  15. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_db_connection.py +0 -0
  16. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +0 -0
  17. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +0 -0
  18. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  19. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  20. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  21. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  22. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +0 -0
  23. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  24. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +0 -0
  25. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +0 -0
  26. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
  27. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  28. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +0 -0
  29. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
  30. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/__init__.py +0 -0
  31. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_defaults.py +0 -0
  32. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  33. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_params_config.py +0 -0
  34. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_query_config.py +0 -0
  35. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/__init__.py +0 -0
  36. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_airflow_manager.py +0 -0
  37. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_clickhouse_writer.py +0 -0
  38. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_credentials.py +0 -0
  39. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_data_utils.py +0 -0
  40. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_data_wrapper.py +0 -0
  41. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_date_utils.py +0 -0
  42. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_df_utils.py +0 -0
  43. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_file_utils.py +0 -0
  44. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_filepath_generator.py +0 -0
  45. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_log_utils.py +0 -0
  46. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_parquet_saver.py +0 -0
  47. {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_storage_manager.py +0 -0
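The four deletions at the top of this list remove the alternate Django-to-Dask reader (_io_dask_alt.py) and the entire sql_model backend from the package; the remaining 41 files are carried over unchanged. The diff shows no replacement import path, so callers that imported the sql_model classes directly will break on 0.3.24. A minimal, hypothetical guard (only the module path and class names come from the deleted files shown below; the fallback flag is an assumption, not part of sibi-dst):

    # Hypothetical compatibility probe: the sql_model backend listed as deleted
    # above does not ship in sibi-dst 0.3.24, so guard the import explicitly.
    try:
        from sibi_dst.df_helper.backends.sql_model import (
            SQLModelConnectionConfig,
            SQLModelLoadFromDb,
        )
        HAS_SQLMODEL_BACKEND = True
    except ImportError:
        # Taken on 0.3.24 (and whenever sibi-dst is absent); fall back to another
        # backend such as sql_alchemy, which is unchanged in this release.
        HAS_SQLMODEL_BACKEND = False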
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.22
+Version: 0.3.24
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -25,6 +25,7 @@ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: paramiko (>=3.5.0,<4.0.0)
 Requires-Dist: psutil (>=6.1.0,<7.0.0)
+Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
 Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
 Requires-Dist: pydantic (>=2.9.2,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
@@ -33,7 +34,7 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
-Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
+Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.22"
+version = "0.3.24"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -30,10 +30,11 @@ clickhouse-driver = "^0.2.9"
 paramiko = "^3.5.0"
 chardet = "^5.2.0"
 charset-normalizer = "^3.4.0"
-uvicorn = "^0.32.1"
 sqlalchemy = "^2.0.36"
 djangorestframework = "^3.15.2"
 dask-expr = "^1.1.20"
+psycopg2 = "^2.9.10"
+uvicorn = "^0.34.0"
 
 
 [build-system]
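Besides the version bump, the dependency set changes in two ways in both metadata files: psycopg2 (^2.9.10) becomes a required dependency and uvicorn moves from ^0.32.1 to ^0.34.0. Note that psycopg2, unlike psycopg2-binary, typically compiles from source and needs the PostgreSQL client headers available at install time. A small, hypothetical post-install check using only the standard library (the distribution names are taken from the metadata above):

    # Hypothetical sanity check that an installed environment matches the
    # 0.3.24 metadata shown above; standard library only.
    from importlib import metadata

    for dist in ("sibi-dst", "psycopg2", "uvicorn"):
        try:
            print(f"{dist}: {metadata.version(dist)}")
        except metadata.PackageNotFoundError:
            print(f"{dist}: not installed")  # psycopg2 is newly required in 0.3.24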
sibi_dst-0.3.22/sibi_dst/df_helper/backends/django/_io_dask_alt.py
@@ -1,189 +0,0 @@
-import itertools
-
-import dask.dataframe as dd
-import django
-import pandas as pd
-from django.core.cache import cache
-from django.core.exceptions import FieldDoesNotExist
-from django.db import models
-from django.db.models import Field
-from django.utils.encoding import force_str as force_text
-
-
-class ReadFrameDask:
-    FieldDoesNotExist = (
-        django.core.exceptions.FieldDoesNotExist
-        if django.VERSION < (1, 8)
-        else django.core.exceptions.FieldDoesNotExist
-    )
-
-    def __init__(
-        self,
-        qs,
-        **kwargs,
-    ):
-        self.qs = qs
-        self.coerce_float = kwargs.setdefault("coerce_float", False)
-        self.chunk_size = kwargs.setdefault("chunk_size", 1000)
-        self.verbose = kwargs.setdefault("verbose", True)
-
-    @staticmethod
-    def get_model_name(model):
-        return model._meta.model_name
-
-    @staticmethod
-    def get_related_model(field):
-        model = None
-        if hasattr(field, "related_model") and field.related_model:
-            model = field.related_model
-        elif hasattr(field, "rel") and field.rel:
-            model = field.rel.to
-        return model
-
-    @classmethod
-    def get_base_cache_key(cls, model):
-        return (
-            f"dask_{model._meta.app_label}_{cls.get_model_name(model)}_%s_rendering"
-        )
-
-    @classmethod
-    def replace_pk(cls, model):
-        base_cache_key = cls.get_base_cache_key(model)
-
-        def get_cache_key_from_pk(pk):
-            return None if pk is None else base_cache_key % str(pk)
-
-        def inner(pk_series):
-            pk_series = pk_series.astype(object).where(pk_series.notnull(), None)
-            cache_keys = pk_series.apply(get_cache_key_from_pk, convert_dtype=False)
-            unique_cache_keys = list(filter(None, cache_keys.unique()))
-            if not unique_cache_keys:
-                return pk_series
-
-            out_dict = cache.get_many(unique_cache_keys)
-            if len(out_dict) < len(unique_cache_keys):
-                out_dict = dict(
-                    [
-                        (base_cache_key % obj.pk, force_text(obj))
-                        for obj in model.objects.filter(
-                            pk__in=list(filter(None, pk_series.unique()))
-                        )
-                    ]
-                )
-                cache.set_many(out_dict)
-            return list(map(out_dict.get, cache_keys))
-
-        return inner
-
-    @staticmethod
-    def replace_from_choices(choices):
-        def inner(values):
-            return [choices.get(v, v) for v in values]
-
-        return inner
-
-    @classmethod
-    def build_update_functions(cls, fieldnames, fields):
-        for fieldname, field in zip(fieldnames, fields):
-            if not isinstance(field, Field):
-                yield fieldname, None
-            else:
-                if field.choices:
-                    choices = dict([(k, force_text(v)) for k, v in field.flatchoices])
-                    yield fieldname, cls.replace_from_choices(choices)
-                elif field.get_internal_type() == "ForeignKey":
-                    yield fieldname, cls.replace_pk(cls.get_related_model(field))
-
-    @classmethod
-    def update_with_verbose(cls, df, fieldnames, fields):
-        for fieldname, function in cls.build_update_functions(fieldnames, fields):
-            if function is not None:
-                df[fieldname] = df[fieldname].map_partitions(lambda x: function(x))
-
-    @staticmethod
-    def infer_dtypes_from_django(qs):
-        """Infers Dask data types based on Django queryset model fields, with support for nullable integers."""
-        django_to_dask_dtype = {
-            'AutoField': 'Int64',  # Use nullable integer
-            'BigAutoField': 'Int64',
-            'BigIntegerField': 'Int64',
-            'BooleanField': 'bool',
-            'CharField': 'object',
-            'DateField': 'datetime64[ns]',
-            'DateTimeField': 'datetime64[ns]',
-            'DecimalField': 'float64',
-            'FloatField': 'float64',
-            'IntegerField': 'Int64',  # Use nullable integer
-            'PositiveIntegerField': 'Int64',
-            'SmallIntegerField': 'Int64',
-            'TextField': 'object',
-            'TimeField': 'object',
-            'UUIDField': 'object',
-            'ForeignKey': 'Int64',  # Use nullable integer for FK fields
-        }
-
-        dtypes = {}
-        # Handle model fields
-        for field in qs.model._meta.get_fields():
-            # Skip reverse relationships and non-concrete fields
-            if not getattr(field, 'concrete', False):
-                continue
-
-            # Check for AutoField or BigAutoField explicitly
-            if isinstance(field, (models.AutoField, models.BigAutoField)):
-                dtypes[field.name] = 'Int64'  # Nullable integer for autoincremented fields
-            else:
-                # Use field type to infer dtype
-                field_type = field.get_internal_type()
-                dtypes[field.name] = django_to_dask_dtype.get(field_type, 'object')
-
-        # Handle annotated fields
-        for annotation_name, annotation in qs.query.annotation_select.items():
-            if hasattr(annotation, 'output_field'):
-                field_type = annotation.output_field.get_internal_type()
-                dtype = django_to_dask_dtype.get(field_type, 'object')
-            else:
-                dtype = 'object'  # Default to object for untyped annotations
-            dtypes[annotation_name] = dtype
-
-        return dtypes
-
-    def read_frame(self, fillna_value=None):
-        qs = self.qs
-        fieldnames = tuple(qs.model._meta.get_fields())
-        dtypes = self.infer_dtypes_from_django(qs)
-        chunk_size = self.chunk_size
-        verbose = self.verbose
-
-        # Use values to directly fetch required fields
-        qs = qs.values(*fieldnames)
-
-        # Create partitions for Dask
-        partitions = []
-        iterator = qs.iterator(chunk_size=chunk_size)
-        for chunk in itertools.islice(iterator, chunk_size):
-            df = pd.DataFrame.from_records(chunk, columns=fieldnames)
-
-            # Handle NaN values
-            if fillna_value:
-                df = df.fillna(fillna_value)
-
-            # Optimize timezone conversions
-            for col in df.columns:
-                if isinstance(df[col].dtype, pd.DatetimeTZDtype):
-                    df[col] = df[col].dt.tz_localize(None)
-
-            # Optimize dtype conversion
-            df = df.convert_dtypes()
-
-            # Convert to Dask DataFrame
-            partitions.append(dd.from_pandas(df, npartitions=1))
-
-        # Combine all partitions
-        dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-
-        # Apply verbose updates
-        if verbose:
-            self.update_with_verbose(dask_df, fieldnames, qs.model._meta.fields)
-
-        return dask_df
sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/__init__.py
@@ -1,9 +0,0 @@
-from __future__ import annotations
-
-from ._sqlmodel_db_connection import SQLModelConnectionConfig
-from ._sqlmodel_load_from_db import SQLModelLoadFromDb
-
-__all__ = [
-    "SQLModelLoadFromDb",
-    "SQLModelConnectionConfig",
-]
sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py
@@ -1,134 +0,0 @@
-import datetime
-from typing import Any, Optional, Dict, Type
-
-from pydantic import BaseModel, model_validator
-from sqlalchemy import inspect
-from sqlalchemy.exc import OperationalError
-from sqlalchemy.sql import text
-from sqlalchemy.sql.sqltypes import (
-    Integer,
-    String,
-    Float,
-    Boolean,
-    DateTime,
-    Date,
-    Time,
-    Numeric,
-)
-from sqlmodel import SQLModel, Field, create_engine
-
-
-class SQLModelConnectionConfig(BaseModel):
-    live: bool = False
-    connection_url: str
-    table: Optional[str] = None
-    model: Optional[Any] = None
-    engine: Optional[Any] = None  # Save engine to reuse it
-
-    class Config:
-        arbitrary_types_allowed = True
-
-    @model_validator(mode="after")
-    def validate_and_initialize(self):
-        """
-        Validate connection parameters, initialize the engine, and build the dynamic model if necessary.
-        """
-        # Validate `connection_url`
-        if not self.connection_url:
-            raise ValueError("`connection_url` must be provided.")
-
-        # Initialize the engine
-        self.engine = create_engine(self.connection_url)
-
-        # Validate the connection
-        self.validate_connection()
-
-        # If table is provided, set `live=False`
-        if self.table:
-            self.live = False
-
-        # If model is not provided, build dynamically
-        if not self.model:
-            if not self.table:
-                raise ValueError("`table_name` must be provided to build the model.")
-            try:
-                self.model = self.build_model()
-            except Exception as e:
-                raise ValueError(f"Failed to build model for table '{self.table}': {e}")
-        else:
-            self.live = True
-
-        return self
-
-    def validate_connection(self):
-        """
-        Test the database connection by executing a simple query.
-        """
-        try:
-            with self.engine.connect() as connection:
-                connection.execute(text("SELECT 1"))
-        except OperationalError as e:
-            raise ValueError(f"Failed to connect to the database: {e}")
-
-    def build_model(self) -> Type[SQLModel]:
-        """
-        Dynamically build a SQLModel class based on the table schema.
-        """
-        inspector = inspect(self.engine)
-
-        # Validate table existence
-        if self.table not in inspector.get_table_names():
-            raise ValueError(f"Table '{self.table}' does not exist in the database.")
-
-        columns = inspector.get_columns(self.table)
-        if not columns:
-            raise ValueError(f"No columns found for table '{self.table}'.")
-
-        type_mapping = {
-            Integer: int,
-            String: str,
-            Float: float,
-            Boolean: bool,
-            DateTime: datetime.datetime,
-            Date: datetime.date,
-            Time: datetime.time,
-            Numeric: float,
-        }
-
-        annotations: Dict[str, Type] = {}
-        model_fields = {}
-
-        for column in columns:
-            name = column["name"]
-            sa_type = column["type"]
-            nullable = column["nullable"]
-            default = column.get("default", None)
-            primary_key = column.get("primary_key", False)
-
-            py_type = None
-            for sa_base_type, py_base_type in type_mapping.items():
-                if isinstance(sa_type, sa_base_type):
-                    py_type = py_base_type
-                    break
-
-            if py_type is None:
-                raise ValueError(f"Unsupported SQLAlchemy type for column '{name}': {sa_type}")
-
-            # Define field type and attributes
-            annotations[name] = py_type
-            model_fields[name] = Field(
-                default=default,
-                nullable=nullable,
-                primary_key=primary_key,
-                sa_column_args={"type_": sa_type},
-            )
-
-        model_fields["__annotations__"] = annotations
-        model_fields["__table__"] = self.table
-        model_name = self._table2model(self.table)
-        return type(model_name, (SQLModel,), model_fields)
-
-    @staticmethod
-    def _table2model(table_name: str) -> str:
-        """Convert table name to PascalCase model name."""
-        return "".join(word.capitalize() for word in table_name.split("_"))
sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py
@@ -1,101 +0,0 @@
-import logging
-from typing import Any, Dict, Optional
-
-import dask.dataframe as dd
-import pandas as pd
-from sqlmodel import Session, select, text
-
-
-class SQLModelLoadFromDb:
-    df: dd.DataFrame
-
-    def __init__(
-        self,
-        db_connection,
-        db_query: Optional[Dict[str, Any]] = None,
-        db_params: Optional[Dict[str, Any]] = None,
-        logger=None,
-        **kwargs,
-    ):
-        """
-        Initialize the loader with database connection, query, and parameters.
-        """
-        self.db_connection = db_connection
-        self.table_name = self.db_connection.table
-        self.model = self.db_connection.model
-        self.engine = self.db_connection.engine
-        self.logger = logger or self._default_logger()
-        self.query_config = db_query or {}
-        self.params_config = db_params or {}
-        self.debug = kwargs.pop("debug", False)
-
-    def _default_logger(self):
-        """Create a default logger."""
-        logging.basicConfig(level=logging.INFO)
-        return logging.getLogger("SQLModelLoadFromDb")
-
-    def build_and_load(self) -> dd.DataFrame:
-        """
-        Load data into a Dask DataFrame based on the query and parameters.
-        """
-        self.df = self._build_and_load()
-        if not self.df.empty:
-            self._process_loaded_data()
-        return self.df
-
-    def _build_and_load(self) -> dd.DataFrame:
-        """
-        Query the database and load results into a Dask DataFrame.
-        """
-        print(self.model.__name__)
-        with Session(self.engine) as session:
-            try:
-                query = select(text(self.model.__table__))
-                print("query:", query)
-
-                # Apply filters if provided
-                filters = self.params_config.df_params.get("filters")
-                if filters:
-                    # Apply ORM filters (simple equality conditions)
-                    for column_name, value in filters.items():
-                        column = getattr(self.model, column_name, None)
-                        if column is not None:
-                            query = query.filter(column == value)
-                        else:
-                            self.logger.warning(f"Filter column '{column_name}' not found in model.")
-
-                # Apply limit if provided in query_config
-                n_records = self.query_config.n_records
-                if n_records:
-                    query = query.limit(n_records)
-
-                # Debug: Log the SQL query
-                self.logger.debug(f"Executing query: {str(query)}")
-
-                # Execute the query
-                results = session.exec(query).fetchall()
-
-                # Convert query results to a Dask DataFrame
-                print("results:", results)
-                if results:
-                    df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
-                else:
-                    self.logger.debug("Query returned no results.")
-                    df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-            except Exception as e:
-                print(e)
-                self.logger.error(f"Error loading data: {e}")
-                df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-        return df
-
-    def _process_loaded_data(self):
-        """
-        Process and clean the loaded data.
-        """
-        field_map = self.params_config.get("field_map", {})
-        if field_map:
-            rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
-            if rename_mapping:
-                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})