sibi-dst 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/df_helper/_df_helper.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
@@ -116,7 +117,7 @@ class DfHelper:
             self._post_process_df()
             self.logger.info("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.error(f"Failed to load data from sqlqlchemy database: {e}")
+            self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -195,10 +196,16 @@ class DfHelper:
         self.logger.info("Post-processing of DataFrame completed.")
 
     def _process_loaded_data(self):
-        if len(self.df.index) > 0:
-            field_map = self.plugin_params.field_map or []
-            if field_map:
+        self.logger.info(f"Type of self.df: {type(self.df)}")
+        if self.df.map_partitions(len).compute().sum() > 0:
+            field_map = self.plugin_params.field_map or {}
+            if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
+                missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
+
+                if missing_columns:
+                    self.logger.warning(
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}")
 
                 def rename_columns(df, mapping):
                     return df.rename(columns=mapping)
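
Note: the check above replaces len(self.df.index) with an explicit sum of per-partition lengths, which works uniformly on a lazy Dask collection, and the field_map rename now warns about keys that are missing from the frame. A minimal standalone sketch of the same pattern, with illustrative data rather than code from the package:

import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"old_name": [1, 2, 3]}), npartitions=2)
field_map = {"old_name": "new_name", "not_present": "ignored"}

# Summing per-partition lengths forces a compute, so the test is exact but not free.
if df.map_partitions(len).compute().sum() > 0:
    rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
    missing_columns = [k for k in field_map if k not in df.columns]
    if missing_columns:
        print(f"Columns in field_map not present in the DataFrame: {missing_columns}")
    df = df.rename(columns=rename_mapping)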
@@ -214,13 +221,10 @@ class DfHelper:
         ps.save_to_parquet(parquet_filename)
         self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
-    def save_to_clickhouse(self, database, table, order_by=None, **credentials):
-        click_config ={
-            'database': database,
-            'table': table,
-            'order_by': order_by or 'id',
-        }
-        credentials = {**credentials, **click_config}
+    def save_to_clickhouse(self, **credentials):
+        if self.df.map_partitions(len).compute().sum() == 0:
+            self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+            return
         cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
         self.logger.info("Save to ClickHouse completed.")
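
With this change, database, table and order_by are no longer explicit parameters; everything in **credentials is forwarded to ClickHouseWriter, which defaults order_by to 'id' (see the writer's own hunk further down). A hypothetical call is sketched below; the keyword names other than order_by are assumptions about what ClickHouseWriter accepts, not something this diff confirms.

# 'helper', 'host', 'database' and 'table' are illustrative names only.
helper.save_to_clickhouse(
    host="localhost",
    database="analytics",
    table="orders",
    order_by="id",
)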

sibi_dst/df_helper/plugins/django/_io_dask.py
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import django
 import pandas as pd
 from django.core.cache import cache
@@ -239,4 +240,7 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, fields)
 
+        if isinstance(dask_df, dask_expr._collection.DataFrame):
+            dask_df = dask_df.to_legacy_dataframe()
+
         return dask_df
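
The dask[complete] >=2024.11.1 pin in this release resolves to a Dask line where query planning is on by default, so collections come back as dask_expr objects; the added guard converts them to the legacy dask.dataframe type the rest of the code expects. A minimal sketch of the same guard, assuming dask_expr is importable (it ships with that Dask line):

import dask.dataframe as dd
import dask_expr
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1)

# With query planning enabled this is a dask_expr-backed collection; convert it
# back to the legacy collection type before handing it to legacy-only code.
if isinstance(ddf, dask_expr._collection.DataFrame):
    ddf = ddf.to_legacy_dataframe()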

sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker
@@ -10,7 +11,7 @@ from sibi_dst.utils import Logger
 
 
 class SQLAlchemyDask:
-    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
         """
         Initialize with an SQLAlchemy query and database engine URL.
 
@@ -19,13 +20,13 @@ class SQLAlchemyDask:
         :param engine_url: Database connection string for SQLAlchemy engine.
         :param chunk_size: Number of records per chunk for Dask partitions.
         :param logger: Logger instance for logging.
-        :param verbose: Whether to print detailed logs.
+        :param debug: Whether to print detailed logs.
         """
         self.query = None
         self.model = model
         self.filters = filters
         self.chunk_size = chunk_size
-        self.verbose = verbose
+        self.debug = debug
         self.engine = create_engine(engine_url)
         self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -118,14 +119,17 @@ class SQLAlchemyDask:
                 partitions.append(dd.from_pandas(df, npartitions=1))
 
             # Concatenate all partitions
-            # print(partitions)
             if partitions:
                 dask_df = dd.concat(partitions, axis=0, ignore_index=True)
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
 
-            if self.verbose:
+            if self.debug:
                 self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+
+            if isinstance(dask_df, dask_expr._collection.DataFrame):
+                dask_df = dask_df.to_legacy_dataframe()
+
             return dask_df
 
         except Exception as e:

sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -1,22 +1,13 @@
-from typing import Dict
-
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy.inspection import inspect
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy import select
-#from sqlmodel import Session, select
 
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
-    normalize_sqlalchemy_type
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
-from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
 
-
 class SqlAlchemyLoadFromDb:
-    df: dd.DataFrame
+    df: dd.DataFrame = None
 
     def __init__(
         self,
@@ -43,17 +34,25 @@ class SqlAlchemyLoadFromDb:
         """
         Load data into a Dask DataFrame based on the query and parameters.
         """
-        self.df = self._build_and_load()
+        self._build_and_load()
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
+
         try:
-            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
-            df = reader.read_frame()
-            if df is None or len(df.index) == 0:
+            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
+            self.df = SQLAlchemyDask(
+                model=self.model,
+                filters=self.params_config.filters,
+                engine_url=self.engine.url,
+                logger=self.logger,
+                chunk_size=1000,
+                debug=self.debug).read_frame()
+            if self.df is None or len(self.df.head().index) == 0:
                 self.logger.warning("Query returned no results.")
                 return dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return df
+
+            return self.df
         except Exception as e:
             self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
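
The emptiness test here also changed: instead of len(df.index), the loader now checks len(self.df.head().index), and ClickHouseWriter below does the same. On a Dask DataFrame, head() materialises only the first few rows of the leading partition, so the check stays cheap even for large results. A rough sketch of the idiom, assuming an ordinary Dask collection:

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame(columns=["a"]), npartitions=1)

# head() returns a small pandas DataFrame, so this length check does not
# compute the whole collection.
if len(ddf.head().index) == 0:
    print("Query returned no results.")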

sibi_dst/utils/_clickhouse_writer.py
@@ -31,9 +31,9 @@ class ClickHouseWriter:
         self.order_by=kwargs.setdefault('order_by','id')
 
     def save_to_clickhouse(self, df, **kwargs):
-        self.df = df
+        self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("Dataframe is empty")
             return
         self._handle_missing_values()
@@ -122,7 +122,7 @@ class ClickHouseWriter:
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("No data found. Nothing written.")
             return
 

sibi_dst-0.3.14.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.12
+Version: 0.3.14
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -14,7 +14,8 @@ Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
-Requires-Dist: django (==4.1.13)
+Requires-Dist: django (>=5.1.4,<6.0.0)
+Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
 Requires-Dist: httpx (>=0.27.2,<0.28.0)
 Requires-Dist: ipython (>=8.29.0,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -28,7 +29,7 @@ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
 Requires-Dist: pytest (>=8.3.3,<9.0.0)
 Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
-Requires-Dist: sqlmodel (>=0.0.22,<0.0.23)
+Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
 Requires-Dist: uvicorn (>=0.32.1,<0.33.0)

sibi_dst-0.3.14.dist-info/RECORD
@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=43-eY9mDU-j-QFeAtdMjIb3KuC2_hYzLjVi177_EKAo,13006
+sibi_dst/df_helper/_df_helper.py,sha256=iYG8uL1ILrBvjtH8oiSwbPHnlDsJLlHtSghDDlt7T-w,13365
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
 sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
 sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
@@ -11,7 +11,7 @@ sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X
 sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
 sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NrkBb5LM1A_vo3wAotqj2sVVYIuTfFGrQqIXk3xOoDs,5162
 sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
-sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=jryDojeA62rB3seRaWWMjsAmekKacK5xctwCQGVklPQ,9063
+sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=aGaHziEMWK4zk9kkMNq2QtVevqVOCWqoAlXT1lVgRok,9198
 sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
 sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
 sibi_dst/df_helper/plugins/http/_http_config.py,sha256=WH0d4vsxfZRhWrWI4iTVAnhsdY3421SBr9kXYZVfeYQ,2126
@@ -19,17 +19,17 @@ sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPj
 sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
 sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
 sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
-sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=6IjQEREXqTAzSJE95FKfXjRkTlEjRMS4hJ_yMpyKDTg,5223
+sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=1WQ390XBFWOSXo0ea6-hz1LM6Ppi-j6ToZYr7sQBldE,5330
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=myrtEzK12DvA73x7QFaqXFb_TxOPMrsVj-mxYHJD2dg,2371
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=balWGKun0NKIfhLZW-_DCOhKuTzTo_C2NwZoKFwuSJo,2329
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTeFcrf8sFpzqLlQfspjduKuR7Wgn7mDPUASsQs6s,4355
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
 sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,831
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
-sibi_dst/utils/_clickhouse_writer.py,sha256=mdgszbyVluhGvDmvsHY4XDTZrp42L3xtdmiyn3z2bYM,8534
+sibi_dst/utils/_clickhouse_writer.py,sha256=kNBQeDn3D4Javrz5L8uU_5itf8Mrvm9l29uxcmcKlbg,8555
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
 sibi_dst/utils/_data_utils.py,sha256=BvmjMNSkICy671BmjW68RhvDMfN5uAXwhffSV-wEwmk,9185
 sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
@@ -40,6 +40,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.12.dist-info/METADATA,sha256=5mezOBAiUV2pMgNsVqI7iCZZgmxeZpLuYWDYUAZCTVk,2030
-sibi_dst-0.3.12.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sibi_dst-0.3.12.dist-info/RECORD,,
+sibi_dst-0.3.14.dist-info/METADATA,sha256=ysmNqT8NnhY_VlPmrQ2U3FnXWFEIvfwFRi8uSGRP6g0,2090
+sibi_dst-0.3.14.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.14.dist-info/RECORD,,