sibi-dst 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +15 -11
- sibi_dst/df_helper/plugins/django/_io_dask.py +4 -0
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -5
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +15 -16
- sibi_dst/utils/_clickhouse_writer.py +3 -3
- {sibi_dst-0.3.12.dist-info → sibi_dst-0.3.14.dist-info}/METADATA +4 -3
- {sibi_dst-0.3.12.dist-info → sibi_dst-0.3.14.dist-info}/RECORD +8 -8
- {sibi_dst-0.3.12.dist-info → sibi_dst-0.3.14.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Any, Dict, TypeVar
|
|
4
4
|
from typing import Union, Optional
|
5
5
|
|
6
6
|
import dask.dataframe as dd
|
7
|
+
import dask_expr
|
7
8
|
import pandas as pd
|
8
9
|
from pydantic import BaseModel
|
9
10
|
|
@@ -116,7 +117,7 @@ class DfHelper:
|
|
116
117
|
self._post_process_df()
|
117
118
|
self.logger.info("Data successfully loaded from sqlalchemy database.")
|
118
119
|
except Exception as e:
|
119
|
-
self.logger.error(f"Failed to load data from
|
120
|
+
self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
|
120
121
|
self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
|
121
122
|
|
122
123
|
return self.df
|
@@ -195,10 +196,16 @@ class DfHelper:
|
|
195
196
|
self.logger.info("Post-processing of DataFrame completed.")
|
196
197
|
|
197
198
|
def _process_loaded_data(self):
|
198
|
-
|
199
|
-
|
200
|
-
|
199
|
+
self.logger.info(f"Type of self.df: {type(self.df)}")
|
200
|
+
if self.df.map_partitions(len).compute().sum() > 0:
|
201
|
+
field_map = self.plugin_params.field_map or {}
|
202
|
+
if isinstance(field_map, dict):
|
201
203
|
rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
|
204
|
+
missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
|
205
|
+
|
206
|
+
if missing_columns:
|
207
|
+
self.logger.warning(
|
208
|
+
f"The following columns in field_map are not in the DataFrame: {missing_columns}")
|
202
209
|
|
203
210
|
def rename_columns(df, mapping):
|
204
211
|
return df.rename(columns=mapping)
|
@@ -214,13 +221,10 @@ class DfHelper:
|
|
214
221
|
ps.save_to_parquet(parquet_filename)
|
215
222
|
self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
|
216
223
|
|
217
|
-
def save_to_clickhouse(self,
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
'order_by': order_by or 'id',
|
222
|
-
}
|
223
|
-
credentials = {**credentials, **click_config}
|
224
|
+
def save_to_clickhouse(self, **credentials):
|
225
|
+
if self.df.map_partitions(len).compute().sum() == 0:
|
226
|
+
self.logger.info("Cannot write to clickhouse since Dataframe is empty")
|
227
|
+
return
|
224
228
|
cs=ClickHouseWriter(logger=self.logger, **credentials)
|
225
229
|
cs.save_to_clickhouse(self.df)
|
226
230
|
self.logger.info("Save to ClickHouse completed.")
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import itertools
|
2
2
|
|
3
3
|
import dask.dataframe as dd
|
4
|
+
import dask_expr
|
4
5
|
import django
|
5
6
|
import pandas as pd
|
6
7
|
from django.core.cache import cache
|
@@ -239,4 +240,7 @@ class ReadFrameDask:
|
|
239
240
|
if verbose:
|
240
241
|
self.update_with_verbose(dask_df, fieldnames, fields)
|
241
242
|
|
243
|
+
if isinstance(dask_df, dask_expr._collection.DataFrame):
|
244
|
+
dask_df = dask_df.to_legacy_dataframe()
|
245
|
+
|
242
246
|
return dask_df
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import itertools
|
2
2
|
|
3
3
|
import dask.dataframe as dd
|
4
|
+
import dask_expr
|
4
5
|
import pandas as pd
|
5
6
|
from sqlalchemy import create_engine, inspect, select
|
6
7
|
from sqlalchemy.orm import sessionmaker
|
@@ -10,7 +11,7 @@ from sibi_dst.utils import Logger
|
|
10
11
|
|
11
12
|
|
12
13
|
class SQLAlchemyDask:
|
13
|
-
def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None,
|
14
|
+
def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
|
14
15
|
"""
|
15
16
|
Initialize with an SQLAlchemy query and database engine URL.
|
16
17
|
|
@@ -19,13 +20,13 @@ class SQLAlchemyDask:
|
|
19
20
|
:param engine_url: Database connection string for SQLAlchemy engine.
|
20
21
|
:param chunk_size: Number of records per chunk for Dask partitions.
|
21
22
|
:param logger: Logger instance for logging.
|
22
|
-
:param
|
23
|
+
:param debug: Whether to print detailed logs.
|
23
24
|
"""
|
24
25
|
self.query = None
|
25
26
|
self.model = model
|
26
27
|
self.filters = filters
|
27
28
|
self.chunk_size = chunk_size
|
28
|
-
self.
|
29
|
+
self.debug = debug
|
29
30
|
self.engine = create_engine(engine_url)
|
30
31
|
self.Session = sessionmaker(bind=self.engine)
|
31
32
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
@@ -118,14 +119,17 @@ class SQLAlchemyDask:
|
|
118
119
|
partitions.append(dd.from_pandas(df, npartitions=1))
|
119
120
|
|
120
121
|
# Concatenate all partitions
|
121
|
-
# print(partitions)
|
122
122
|
if partitions:
|
123
123
|
dask_df = dd.concat(partitions, axis=0, ignore_index=True)
|
124
124
|
else:
|
125
125
|
dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
|
126
126
|
|
127
|
-
if self.
|
127
|
+
if self.debug:
|
128
128
|
self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
|
129
|
+
|
130
|
+
if isinstance(dask_df, dask_expr._collection.DataFrame):
|
131
|
+
dask_df = dask_df.to_legacy_dataframe()
|
132
|
+
|
129
133
|
return dask_df
|
130
134
|
|
131
135
|
except Exception as e:
|
@@ -1,22 +1,13 @@
|
|
1
|
-
from typing import Dict
|
2
|
-
|
3
1
|
import dask.dataframe as dd
|
4
2
|
import pandas as pd
|
5
|
-
from sqlalchemy.inspection import inspect
|
6
|
-
from sqlalchemy.orm import sessionmaker
|
7
|
-
from sqlalchemy import select
|
8
|
-
#from sqlmodel import Session, select
|
9
3
|
|
10
|
-
from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
|
11
|
-
normalize_sqlalchemy_type
|
4
|
+
from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
|
12
5
|
from sibi_dst.utils import Logger
|
13
6
|
from ._io_sqlalchemy_dask import SQLAlchemyDask
|
14
|
-
from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
|
15
7
|
from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
|
16
8
|
|
17
|
-
|
18
9
|
class SqlAlchemyLoadFromDb:
|
19
|
-
df: dd.DataFrame
|
10
|
+
df: dd.DataFrame = None
|
20
11
|
|
21
12
|
def __init__(
|
22
13
|
self,
|
@@ -43,17 +34,25 @@ class SqlAlchemyLoadFromDb:
|
|
43
34
|
"""
|
44
35
|
Load data into a Dask DataFrame based on the query and parameters.
|
45
36
|
"""
|
46
|
-
self.
|
37
|
+
self._build_and_load()
|
47
38
|
return self.df
|
48
39
|
|
49
40
|
def _build_and_load(self) -> dd.DataFrame:
|
41
|
+
|
50
42
|
try:
|
51
|
-
reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000,
|
52
|
-
df =
|
53
|
-
|
43
|
+
# reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
|
44
|
+
self.df = SQLAlchemyDask(
|
45
|
+
model=self.model,
|
46
|
+
filters=self.params_config.filters,
|
47
|
+
engine_url=self.engine.url,
|
48
|
+
logger=self.logger,
|
49
|
+
chunk_size=1000,
|
50
|
+
debug=self.debug).read_frame()
|
51
|
+
if self.df is None or len(self.df.head().index) == 0:
|
54
52
|
self.logger.warning("Query returned no results.")
|
55
53
|
return dd.from_pandas(pd.DataFrame(), npartitions=1)
|
56
|
-
|
54
|
+
|
55
|
+
return self.df
|
57
56
|
except Exception as e:
|
58
57
|
self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
|
59
58
|
return dd.from_pandas(pd.DataFrame(), npartitions=1)
|
@@ -31,9 +31,9 @@ class ClickHouseWriter:
|
|
31
31
|
self.order_by=kwargs.setdefault('order_by','id')
|
32
32
|
|
33
33
|
def save_to_clickhouse(self, df, **kwargs):
|
34
|
-
self.df = df
|
34
|
+
self.df = df.copy()
|
35
35
|
self.order_by = kwargs.setdefault('order_by',self.order_by)
|
36
|
-
if len(self.df.index) == 0:
|
36
|
+
if len(self.df.head().index) == 0:
|
37
37
|
self.logger.info("Dataframe is empty")
|
38
38
|
return
|
39
39
|
self._handle_missing_values()
|
@@ -122,7 +122,7 @@ class ClickHouseWriter:
|
|
122
122
|
"""
|
123
123
|
Writes the Dask DataFrame to a ClickHouse table partition by partition.
|
124
124
|
"""
|
125
|
-
if len(self.df.index) == 0:
|
125
|
+
if len(self.df.head().index) == 0:
|
126
126
|
self.logger.info("No data found. Nothing written.")
|
127
127
|
return
|
128
128
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sibi-dst
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.14
|
4
4
|
Summary: Data Science Toolkit
|
5
5
|
Author: Luis Valverde
|
6
6
|
Author-email: lvalverdeb@gmail.com
|
@@ -14,7 +14,8 @@ Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
|
|
14
14
|
Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
|
15
15
|
Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
|
16
16
|
Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
|
17
|
-
Requires-Dist: django (
|
17
|
+
Requires-Dist: django (>=5.1.4,<6.0.0)
|
18
|
+
Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
|
18
19
|
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
19
20
|
Requires-Dist: ipython (>=8.29.0,<9.0.0)
|
20
21
|
Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
|
@@ -28,7 +29,7 @@ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
|
28
29
|
Requires-Dist: pymysql (>=1.1.1,<2.0.0)
|
29
30
|
Requires-Dist: pytest (>=8.3.3,<9.0.0)
|
30
31
|
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
31
|
-
Requires-Dist:
|
32
|
+
Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
|
32
33
|
Requires-Dist: tornado (>=6.4.1,<7.0.0)
|
33
34
|
Requires-Dist: tqdm (>=4.67.0,<5.0.0)
|
34
35
|
Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
|
2
2
|
sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
|
3
|
-
sibi_dst/df_helper/_df_helper.py,sha256=
|
3
|
+
sibi_dst/df_helper/_df_helper.py,sha256=iYG8uL1ILrBvjtH8oiSwbPHnlDsJLlHtSghDDlt7T-w,13365
|
4
4
|
sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
|
5
5
|
sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
|
6
6
|
sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
|
@@ -11,7 +11,7 @@ sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X
|
|
11
11
|
sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
|
12
12
|
sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NrkBb5LM1A_vo3wAotqj2sVVYIuTfFGrQqIXk3xOoDs,5162
|
13
13
|
sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
|
14
|
-
sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=
|
14
|
+
sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=aGaHziEMWK4zk9kkMNq2QtVevqVOCWqoAlXT1lVgRok,9198
|
15
15
|
sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
|
16
16
|
sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
|
17
17
|
sibi_dst/df_helper/plugins/http/_http_config.py,sha256=WH0d4vsxfZRhWrWI4iTVAnhsdY3421SBr9kXYZVfeYQ,2126
|
@@ -19,17 +19,17 @@ sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPj
|
|
19
19
|
sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
|
20
20
|
sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
|
21
21
|
sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
|
22
|
-
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=
|
22
|
+
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=1WQ390XBFWOSXo0ea6-hz1LM6Ppi-j6ToZYr7sQBldE,5330
|
23
23
|
sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
|
24
24
|
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
|
25
|
-
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=
|
25
|
+
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=balWGKun0NKIfhLZW-_DCOhKuTzTo_C2NwZoKFwuSJo,2329
|
26
26
|
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTeFcrf8sFpzqLlQfspjduKuR7Wgn7mDPUASsQs6s,4355
|
27
27
|
sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
|
28
28
|
sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
|
29
29
|
sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
|
30
30
|
sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,831
|
31
31
|
sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
|
32
|
-
sibi_dst/utils/_clickhouse_writer.py,sha256=
|
32
|
+
sibi_dst/utils/_clickhouse_writer.py,sha256=kNBQeDn3D4Javrz5L8uU_5itf8Mrvm9l29uxcmcKlbg,8555
|
33
33
|
sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
|
34
34
|
sibi_dst/utils/_data_utils.py,sha256=BvmjMNSkICy671BmjW68RhvDMfN5uAXwhffSV-wEwmk,9185
|
35
35
|
sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
|
@@ -40,6 +40,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
|
|
40
40
|
sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
|
41
41
|
sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
|
42
42
|
sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
|
43
|
-
sibi_dst-0.3.
|
44
|
-
sibi_dst-0.3.
|
45
|
-
sibi_dst-0.3.
|
43
|
+
sibi_dst-0.3.14.dist-info/METADATA,sha256=ysmNqT8NnhY_VlPmrQ2U3FnXWFEIvfwFRi8uSGRP6g0,2090
|
44
|
+
sibi_dst-0.3.14.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
45
|
+
sibi_dst-0.3.14.dist-info/RECORD,,
|
File without changes
|