sibi-dst 0.3.11__tar.gz → 0.3.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/PKG-INFO +7 -3
  2. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/pyproject.toml +7 -3
  3. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/_df_helper.py +20 -13
  4. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_io_dask.py +4 -0
  5. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/http/_http_config.py +2 -3
  6. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +4 -0
  7. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py +2 -0
  8. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -5
  9. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +15 -16
  10. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/__init__.py +2 -1
  11. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_clickhouse_writer.py +3 -3
  12. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_data_utils.py +91 -62
  13. sibi_dst-0.3.14/sibi_dst/utils/_data_wrapper.py +238 -0
  14. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_date_utils.py +130 -0
  15. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_df_utils.py +91 -0
  16. sibi_dst-0.3.11/sibi_dst/utils/_data_wrapper.py +0 -556
  17. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/README.md +0 -0
  18. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/__init__.py +0 -0
  19. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/__init__.py +0 -0
  20. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  21. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/__init__.py +0 -0
  22. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/_defaults.py +0 -0
  23. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/_params_config.py +0 -0
  24. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/core/_query_config.py +0 -0
  25. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/__init__.py +0 -0
  26. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/__init__.py +0 -0
  27. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_django_db_connection.py +0 -0
  28. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +0 -0
  29. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py +0 -0
  30. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/django/_io_dask_alt.py +0 -0
  31. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/http/__init__.py +0 -0
  32. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/parquet/__init__.py +0 -0
  33. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/__init__.py +0 -0
  34. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
  35. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  36. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
  37. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_model/__init__.py +0 -0
  38. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py +0 -0
  39. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +0 -0
  40. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_airflow_manager.py +0 -0
  41. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_credentials.py +0 -0
  42. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_file_utils.py +0 -0
  43. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_filepath_generator.py +0 -0
  44. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_log_utils.py +0 -0
  45. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_parquet_saver.py +0 -0
  46. {sibi_dst-0.3.11 → sibi_dst-0.3.14}/sibi_dst/utils/_storage_manager.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.11
+Version: 0.3.14
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -9,10 +9,13 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: chardet (>=5.2.0,<6.0.0)
+Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
-Requires-Dist: django (==4.1.13)
+Requires-Dist: django (>=5.1.4,<6.0.0)
+Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
 Requires-Dist: httpx (>=0.27.2,<0.28.0)
 Requires-Dist: ipython (>=8.29.0,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -26,9 +29,10 @@ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
 Requires-Dist: pytest (>=8.3.3,<9.0.0)
 Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
-Requires-Dist: sqlmodel (>=0.0.22,<0.0.23)
+Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.11"
+version = "0.3.14"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -13,13 +13,12 @@ dask = {extras = ["complete"], version = "^2024.11.1"}
 pydantic = "^2.9.2"
 tornado = "^6.4.1"
 psutil = "^6.1.0"
-django = "4.1.13"
+django = "^5.1.4"
 pyarrow = "^18.0.0"
 mysqlclient = "^2.2.6"
 pymysql = "^1.1.1"
 httpx = "^0.27.2"
 python-dotenv = "^1.0.1"
-sqlmodel = "^0.0.22"
 tqdm = "^4.67.0"
 openpyxl = "^3.1.5"
 jinja2 = "^3.1.4"
@@ -29,6 +28,11 @@ pytest = "^8.3.3"
 clickhouse-connect = "^0.8.7"
 clickhouse-driver = "^0.2.9"
 paramiko = "^3.5.0"
+chardet = "^5.2.0"
+charset-normalizer = "^3.4.0"
+uvicorn = "^0.32.1"
+sqlalchemy = "^2.0.36"
+djangorestframework = "^3.15.2"
 
 
 [build-system]
sibi_dst/df_helper/_df_helper.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
@@ -42,6 +43,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
 
@@ -115,7 +117,7 @@ class DfHelper:
             self._post_process_df()
             self.logger.info("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.error(f"Failed to load data from sqlqlchemy database: {e}")
+            self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -194,10 +196,16 @@ class DfHelper:
         self.logger.info("Post-processing of DataFrame completed.")
 
     def _process_loaded_data(self):
-        if len(self.df.index) > 0:
-            field_map = self.plugin_params.field_map or []
-            if field_map:
+        self.logger.info(f"Type of self.df: {type(self.df)}")
+        if self.df.map_partitions(len).compute().sum() > 0:
+            field_map = self.plugin_params.field_map or {}
+            if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
+                missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
+
+                if missing_columns:
+                    self.logger.warning(
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}")
 
                 def rename_columns(df, mapping):
                     return df.rename(columns=mapping)
@@ -211,21 +219,20 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
-    def save_to_clickhouse(self, database, table, order_by=None, **credentials):
-        click_config ={
-            'database': database,
-            'table': table,
-            'order_by': order_by or 'id',
-        }
-        credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+    def save_to_clickhouse(self, **credentials):
+        if self.df.map_partitions(len).compute().sum() == 0:
+            self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+            return
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
        if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df
 
     def load_period(self, **kwargs):
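
Note: the emptiness checks above move from `len(self.df.index) > 0` to a per-partition row count. A minimal sketch of that pattern outside the package, using a made-up frame, would look like this:

```python
import dask.dataframe as dd
import pandas as pd

# Hypothetical sample frame; only the row-counting idiom matters here.
ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)

# Count rows per partition, then sum the counts: no global index is needed.
row_count = ddf.map_partitions(len).compute().sum()
if row_count == 0:
    print("DataFrame is empty")
else:
    print(f"DataFrame has {row_count} rows")
```

The same count guards `save_to_clickhouse` before any write is attempted.
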
sibi_dst/df_helper/plugins/django/_io_dask.py
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import django
 import pandas as pd
 from django.core.cache import cache
@@ -239,4 +240,7 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, fields)
 
+        if isinstance(dask_df, dask_expr._collection.DataFrame):
+            dask_df = dask_df.to_legacy_dataframe()
+
         return dask_df
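
Both `_io_dask.py` and `_io_sqlalchemy_dask.py` now convert dask-expr (query-planning) collections back to the legacy `dask.dataframe` type before returning. A hedged sketch of that guard as a standalone helper, assuming a dask/dask-expr release where `to_legacy_dataframe()` is available (the diff itself relies on it):

```python
import dask.dataframe as dd
import pandas as pd

try:
    import dask_expr  # present when dask's query-planning backend is installed
except ImportError:
    dask_expr = None


def ensure_legacy(dask_df):
    """Return a legacy dask.dataframe collection when dask-expr produced the frame."""
    if dask_expr is not None and isinstance(dask_df, dask_expr._collection.DataFrame):
        return dask_df.to_legacy_dataframe()
    return dask_df


ddf = ensure_legacy(dd.from_pandas(pd.DataFrame({"a": [1]}), npartitions=1))
```
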
sibi_dst/df_helper/plugins/http/_http_config.py
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-        if not self.logger:
-            self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""
sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger
 
 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']
sibi_dst/df_helper/plugins/parquet/_parquet_options.py
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])
 
         # Validation for parquet path
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker
@@ -10,7 +11,7 @@ from sibi_dst.utils import Logger
 
 
 class SQLAlchemyDask:
-    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
         """
         Initialize with an SQLAlchemy query and database engine URL.
 
@@ -19,13 +20,13 @@ class SQLAlchemyDask:
         :param engine_url: Database connection string for SQLAlchemy engine.
         :param chunk_size: Number of records per chunk for Dask partitions.
         :param logger: Logger instance for logging.
-        :param verbose: Whether to print detailed logs.
+        :param debug: Whether to print detailed logs.
         """
         self.query = None
         self.model = model
         self.filters = filters
         self.chunk_size = chunk_size
-        self.verbose = verbose
+        self.debug = debug
         self.engine = create_engine(engine_url)
         self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -118,14 +119,17 @@ class SQLAlchemyDask:
                 partitions.append(dd.from_pandas(df, npartitions=1))
 
             # Concatenate all partitions
-            # print(partitions)
             if partitions:
                 dask_df = dd.concat(partitions, axis=0, ignore_index=True)
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
 
-            if self.verbose:
+            if self.debug:
                 self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+
+            if isinstance(dask_df, dask_expr._collection.DataFrame):
+                dask_df = dask_df.to_legacy_dataframe()
+
             return dask_df
 
         except Exception as e:
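
For context, `SQLAlchemyDask.read_frame` assembles its result by turning each fetched chunk into a single-partition frame, concatenating them, and falling back to an empty frame that still carries the expected columns. An illustrative sketch of that assembly step (not the package's exact code; the names are illustrative):

```python
import dask.dataframe as dd
import pandas as pd


def chunks_to_dask(chunks, ordered_columns):
    """Concatenate an iterable of pandas chunks into one Dask DataFrame."""
    partitions = [dd.from_pandas(chunk, npartitions=1) for chunk in chunks]
    if partitions:
        return dd.concat(partitions, axis=0, ignore_index=True)
    # No rows at all: keep the schema so downstream code still sees the columns.
    return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)


ddf = chunks_to_dask([pd.DataFrame({"id": [1, 2]}), pd.DataFrame({"id": [3]})], ["id"])
print(ddf.compute())
```
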
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -1,22 +1,13 @@
-from typing import Dict
-
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy.inspection import inspect
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy import select
-#from sqlmodel import Session, select
 
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
-    normalize_sqlalchemy_type
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
-from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
 
-
 class SqlAlchemyLoadFromDb:
-    df: dd.DataFrame
+    df: dd.DataFrame = None
 
     def __init__(
         self,
@@ -43,17 +34,25 @@ class SqlAlchemyLoadFromDb:
         """
         Load data into a Dask DataFrame based on the query and parameters.
         """
-        self.df = self._build_and_load()
+        self._build_and_load()
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
+
         try:
-            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
-            df = reader.read_frame()
-            if df is None or len(df.index) == 0:
+            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
+            self.df = SQLAlchemyDask(
+                model=self.model,
+                filters=self.params_config.filters,
+                engine_url=self.engine.url,
+                logger=self.logger,
+                chunk_size=1000,
+                debug=self.debug).read_frame()
+            if self.df is None or len(self.df.head().index) == 0:
                 self.logger.warning("Query returned no results.")
                 return dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return df
+
+            return self.df
         except Exception as e:
             self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
sibi_dst/utils/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from ._credentials import ConfigManager, ConfigLoader
 from ._log_utils import Logger
-from ._date_utils import DateUtils
+from ._date_utils import DateUtils, BusinessDays
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
     "ConfigLoader",
     "Logger",
     "DateUtils",
+    "BusinessDays",
     "FileUtils",
     "DataWrapper",
     "DataUtils",
sibi_dst/utils/_clickhouse_writer.py
@@ -31,9 +31,9 @@ class ClickHouseWriter:
         self.order_by=kwargs.setdefault('order_by','id')
 
     def save_to_clickhouse(self, df, **kwargs):
-        self.df = df
+        self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("Dataframe is empty")
             return
         self._handle_missing_values()
@@ -122,7 +122,7 @@ class ClickHouseWriter:
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("No data found. Nothing written.")
             return
 
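
`ClickHouseWriter` now probes emptiness with `len(self.df.head().index) == 0`, which computes only a small prefix of the frame instead of counting every partition. A short sketch of the trade-off; note that Dask's `head()` looks at the first partition unless told otherwise, so a frame whose first partition happens to be empty can look empty under this check:

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=2)

# Cheap probe: only the leading rows are computed.
if len(ddf.head().index) == 0:
    print("Dataframe is empty")
else:
    print("Dataframe has data")
```
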
sibi_dst/utils/_data_utils.py
@@ -7,6 +7,27 @@ class DataUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning('No columns specified')
+
+        columns = [column for column in columns if column in df.columns]
+        for col in columns:
+            if isinstance(df, dd.DataFrame):
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(fill_value)  # Replace NaN with 0
+                    .astype(dtype),
+                    meta=(col, dtype)
+                )
+            else:
+                # For Pandas DataFrame, handle mixed types and invalid values
+                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                df[col] = df[col].fillna(fill_value).astype(dtype)
+
+        return df
+
     @staticmethod
     def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
@@ -14,6 +35,7 @@ class DataUtils:
 
         Parameters:
         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
        - fill_value (int or float): The value to replace NA values with.
        - transform_func (callable, optional): The transformation function to apply.
          If None, no additional transformation is applied.
@@ -28,31 +50,28 @@ class DataUtils:
         if not columns:
             return df
 
+        columns = [column for column in columns if column in df.columns]
         # Default transformation function (identity) if none is provided
         if transform_func is None:
             transform_func = lambda x: x
 
-        # Apply transformations
-        for col in columns:
-            dtype = df[col].dtype
-            if pd.api.types.is_integer_dtype(dtype):
-                meta_type = 'int64'
-            elif pd.api.types.is_float_dtype(dtype):
-                meta_type = 'float64'
-            else:
-                continue  # Skip non-numeric columns
+        # Batch processing for Dask
+        if isinstance(df, dd.DataFrame):
+            def transform_partition(partition):
+                # Apply transformations for all numeric columns in a single pass
+                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
+                return partition
+
+            # Apply the transformation function to all specified columns
+            df = df.map_partitions(transform_partition, meta=df)
+        else:
+            # Pandas: Vectorized operations for all specified columns
+            df[columns] = df[columns].fillna(fill_value).map(transform_func)
 
-            df[col] = df[col].fillna(fill_value).astype(meta_type)
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s.apply(transform_func), meta=(col, meta_type)
-                )
-            else:
-                df[col] = df[col].apply(transform_func)
         return df
 
     @staticmethod
-    def transform_boolean_columns(df, columns=None, sample_size=100):
+    def transform_boolean_columns(df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
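
The new `transform_numeric_cols` applies the coercion chain shown above (`to_numeric` → `fillna` → `astype`) per partition. A hedged usage sketch with invented data; the batched `DataFrame.map` call in `transform_numeric_columns` additionally assumes pandas ≥ 2.1, where `DataFrame.map` replaced `applymap`:

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"qty": ["1", None, "x"], "price": [None, 2.5, 4.0]})
ddf = dd.from_pandas(pdf, npartitions=2)

# Per-column coercion, mirroring transform_numeric_cols: to_numeric -> fillna -> astype.
ddf["qty"] = ddf["qty"].map_partitions(
    lambda s: pd.to_numeric(s, errors="coerce").fillna(0).astype(int),
    meta=("qty", int),
)

print(ddf.compute())  # "x" and None in qty become 0
```
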
@@ -96,73 +115,67 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
-        # Check if the DataFrame is empty
+        # Return early if the DataFrame is empty
         if self.is_dataframe_empty(df):
             return df
 
-        # Extract required parameters with default values
-        source_col = kwargs.pop('source_col', None)
-        lookup_col = kwargs.pop('lookup_col', None)
-        lookup_description_col = kwargs.pop('lookup_description_col', None)
-        source_description_alias = kwargs.pop('source_description_alias', None)
-        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-        fieldnames = kwargs.get('fieldnames', None)
-        column_names = kwargs.get('column_names', None)
+        # Extract and validate required parameters
+        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
+        missing_params = [param for param in required_params if param not in kwargs]
+        if missing_params:
+            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
 
-        # Validate required parameters
-        if not all([source_col, lookup_col, lookup_description_col, source_description_alias]):
-            raise ValueError(
-                'source_col, lookup_col, lookup_description_col, and source_description_alias must be specified'
-            )
+        source_col = kwargs.pop('source_col')
+        lookup_col = kwargs.pop('lookup_col')
+        lookup_description_col = kwargs.pop('lookup_description_col')
+        source_description_alias = kwargs.pop('source_description_alias')
+
+        # Optional parameters with default values
+        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
+        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
+        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
 
         if source_col not in df.columns:
-            self.logger.info(f'{source_col} not in DataFrame columns')
+            self.logger.info(f"{source_col} not in DataFrame columns")
             return df
 
         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
         if isinstance(ids, dd.Series):
             ids = ids.compute()
-        ids = ids.tolist()
-
-        if not ids:
-            self.logger.info(f'No IDs found in the source column: {source_col}')
+        if not len(ids):
+            self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
-        # Set default fieldnames and column_names if not provided
-        if fieldnames is None:
-            kwargs['fieldnames'] = (lookup_col, lookup_description_col)
-        if column_names is None:
-            kwargs['column_names'] = ['temp_join_col', source_description_alias]
-
+        ids = sorted(ids.tolist())
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
-        load_kwargs[f'{lookup_col}__in'] = ids
-
+        load_kwargs.update({
+            'fieldnames': fieldnames,
+            'column_names': column_names,
+            f'{lookup_col}__in': ids
+        })
         # Load lookup data
-        lookup_instance = classname()
+        lookup_instance = classname(debug=True, verbose_debug=True)
         result = lookup_instance.load(**load_kwargs)
-
+        if len(result.index) == 0:
+            self.logger.info(f"No IDs found in the source column: {source_col}")
+            return df
         # Determine the join column on the result DataFrame
-        if 'temp_join_col' in kwargs.get("column_names", []):
-            temp_join_col = 'temp_join_col'
-        else:
-            temp_join_col = lookup_col
+        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
 
         # Merge DataFrames
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias] = df[source_description_alias].fillna('')
+            df[source_description_alias]=df[source_description_alias].fillna('')
 
         # Drop temp_join_col if present
-        if 'temp_join_col' in df.columns:
-            df = df.drop(columns='temp_join_col')
+        df = df.drop(columns='temp_join_col', errors='ignore')
 
         return df
 
-    @staticmethod
-    def is_dataframe_empty(df):
+
+    def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.
 
@@ -173,14 +186,30 @@ class DataUtils:
         - bool: True if the DataFrame is empty, False otherwise.
         """
         if isinstance(df, dd.DataFrame):
-            df_size = df.map_partitions(len).sum().compute()
-            return df_size == 0
-        else:
+            try:
+                return len(df.index) == 0
+            except Exception as e:
+                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                return False
+        elif isinstance(df, pd.DataFrame):
             return df.empty
+        else:
+            self.logger.error("Input must be a pandas or dask DataFrame.")
+            return False
 
     @staticmethod
-    def convert_to_datetime(df, date_fields):
+    def convert_to_datetime_dask(df, date_fields):
+        """
+        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
+
+        Parameters:
+        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
+        - date_fields (list of str): List of column names to convert to datetime.
+
+        Returns:
+        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        """
         for col in date_fields:
             if col in df.columns:
-                df[col] = pd.to_datetime(df[col], errors='coerce')
-        return df
+                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
+        return df
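
`convert_to_datetime` is renamed to `convert_to_datetime_dask` and now converts each partition with an explicit `meta`. A short usage sketch with an invented column name:

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"created_at": ["2024-01-01", "not-a-date", None]})
ddf = dd.from_pandas(pdf, npartitions=1)

# Partition-wise conversion; invalid strings and missing values become NaT.
ddf["created_at"] = ddf["created_at"].map_partitions(
    pd.to_datetime, errors="coerce", meta=("created_at", "datetime64[ns]")
)

print(ddf.compute())
```
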