sibi-dst 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
@@ -42,6 +43,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
 
@@ -115,7 +117,7 @@ class DfHelper:
             self._post_process_df()
             self.logger.info("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.error(f"Failed to load data from sqlqlchemy database: {e}")
+            self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -194,10 +196,16 @@ class DfHelper:
         self.logger.info("Post-processing of DataFrame completed.")
 
     def _process_loaded_data(self):
-        if len(self.df.index) > 0:
-            field_map = self.plugin_params.field_map or []
-            if field_map:
+        self.logger.info(f"Type of self.df: {type(self.df)}")
+        if self.df.map_partitions(len).compute().sum() > 0:
+            field_map = self.plugin_params.field_map or {}
+            if isinstance(field_map, dict):
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
+                missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
+
+                if missing_columns:
+                    self.logger.warning(
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}")
 
                 def rename_columns(df, mapping):
                     return df.rename(columns=mapping)
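
The reworked _process_loaded_data path renames columns through field_map and only warns about mapping keys that are absent from the frame. A minimal standalone sketch of that renaming logic, with an invented field_map and DataFrame:

    import dask.dataframe as dd
    import pandas as pd

    # Hypothetical mapping; "missing" exercises the warning branch.
    field_map = {"id": "order_id", "dt": "order_date", "missing": "never_present"}
    df = dd.from_pandas(pd.DataFrame({"id": [1, 2], "dt": ["2024-01-01", "2024-01-02"]}), npartitions=1)

    rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
    missing_columns = [k for k in field_map if k not in df.columns]
    if missing_columns:
        print(f"The following columns in field_map are not in the DataFrame: {missing_columns}")

    df = df.rename(columns=rename_mapping)
    print(df.columns.tolist())  # ['order_id', 'order_date']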
@@ -211,21 +219,20 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
-    def save_to_clickhouse(self, database, table, order_by=None, **credentials):
-        click_config ={
-            'database': database,
-            'table': table,
-            'order_by': order_by or 'id',
-        }
-        credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+    def save_to_clickhouse(self, **credentials):
+        if self.df.map_partitions(len).compute().sum() == 0:
+            self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+            return
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df
 
     def load_period(self, **kwargs):
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import django
 import pandas as pd
 from django.core.cache import cache
@@ -239,4 +240,7 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, fields)
 
+        if isinstance(dask_df, dask_expr._collection.DataFrame):
+            dask_df = dask_df.to_legacy_dataframe()
+
         return dask_df
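
The same compatibility shim recurs across the loaders in this release: when the installed dask builds collections on the dask_expr backend, the result is converted back to the legacy dask.dataframe API before being returned. A hedged standalone sketch of that check (the helper name here is illustrative, not part of the package):

    import dask.dataframe as dd
    import pandas as pd

    try:
        import dask_expr  # present when dask's query-planning backend is installed
    except ImportError:
        dask_expr = None

    def ensure_legacy(df):
        # Mirrors the isinstance check above: dask_expr collections expose
        # to_legacy_dataframe() to return a plain dask.dataframe.DataFrame.
        if dask_expr is not None and isinstance(df, dask_expr._collection.DataFrame):
            return df.to_legacy_dataframe()
        return df

    ddf = ensure_legacy(dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1))

Whether from_pandas yields a dask_expr-backed collection depends on the dask version and its query-planning setting, which is why the code guards with isinstance rather than converting unconditionally.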
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-        if not self.logger:
-            self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger
 
 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])
 
         # Validation for parquet path
@@ -1,6 +1,7 @@
 import itertools
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from sqlalchemy import create_engine, inspect, select
 from sqlalchemy.orm import sessionmaker
@@ -10,7 +11,7 @@ from sibi_dst.utils import Logger
 
 
 class SQLAlchemyDask:
-    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
         """
         Initialize with an SQLAlchemy query and database engine URL.
 
@@ -19,13 +20,13 @@ class SQLAlchemyDask:
         :param engine_url: Database connection string for SQLAlchemy engine.
         :param chunk_size: Number of records per chunk for Dask partitions.
         :param logger: Logger instance for logging.
-        :param verbose: Whether to print detailed logs.
+        :param debug: Whether to print detailed logs.
         """
         self.query = None
         self.model = model
         self.filters = filters
         self.chunk_size = chunk_size
-        self.verbose = verbose
+        self.debug = debug
         self.engine = create_engine(engine_url)
         self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -118,14 +119,17 @@ class SQLAlchemyDask:
                 partitions.append(dd.from_pandas(df, npartitions=1))
 
             # Concatenate all partitions
-            # print(partitions)
             if partitions:
                 dask_df = dd.concat(partitions, axis=0, ignore_index=True)
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
 
-            if self.verbose:
+            if self.debug:
                 self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+
+            if isinstance(dask_df, dask_expr._collection.DataFrame):
+                dask_df = dask_df.to_legacy_dataframe()
+
             return dask_df
 
         except Exception as e:
@@ -1,22 +1,13 @@
-from typing import Dict
-
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy.inspection import inspect
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy import select
-#from sqlmodel import Session, select
 
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
-    normalize_sqlalchemy_type
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
 from ._io_sqlalchemy_dask import SQLAlchemyDask
-from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
 
-
 class SqlAlchemyLoadFromDb:
-    df: dd.DataFrame
+    df: dd.DataFrame = None
 
     def __init__(
         self,
@@ -43,17 +34,25 @@ class SqlAlchemyLoadFromDb:
         """
         Load data into a Dask DataFrame based on the query and parameters.
         """
-        self.df = self._build_and_load()
+        self._build_and_load()
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
+
         try:
-            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
-            df = reader.read_frame()
-            if df is None or len(df.index) == 0:
+            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
+            self.df = SQLAlchemyDask(
+                model=self.model,
+                filters=self.params_config.filters,
+                engine_url=self.engine.url,
+                logger=self.logger,
+                chunk_size=1000,
+                debug=self.debug).read_frame()
+            if self.df is None or len(self.df.head().index) == 0:
                 self.logger.warning("Query returned no results.")
                 return dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return df
+
+            return self.df
         except Exception as e:
             self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
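
Two different emptiness probes appear in this release: len(df.head().index) == 0, which materialises only a few leading rows, and df.map_partitions(len).compute().sum(), which counts every partition. A rough comparison with invented data:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)

    # Cheap probe: head() pulls rows from the leading partition(s) only.
    looks_empty = len(ddf.head().index) == 0

    # Exact count: length of each partition, summed after compute().
    total_rows = ddf.map_partitions(len).compute().sum()

    print(looks_empty, total_rows)  # False 3

The head-based probe is fast but can misreport emptiness when the first partition happens to be empty; the map_partitions count is exact but forces a full compute.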
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from ._credentials import ConfigManager, ConfigLoader
 from ._log_utils import Logger
-from ._date_utils import DateUtils
+from ._date_utils import DateUtils, BusinessDays
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
     "ConfigLoader",
     "Logger",
     "DateUtils",
+    "BusinessDays",
     "FileUtils",
     "DataWrapper",
     "DataUtils",
@@ -31,9 +31,9 @@ class ClickHouseWriter:
         self.order_by=kwargs.setdefault('order_by','id')
 
     def save_to_clickhouse(self, df, **kwargs):
-        self.df = df
+        self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("Dataframe is empty")
             return
         self._handle_missing_values()
@@ -122,7 +122,7 @@ class ClickHouseWriter:
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
-        if len(self.df.index) == 0:
+        if len(self.df.head().index) == 0:
             self.logger.info("No data found. Nothing written.")
             return
 
@@ -7,6 +7,27 @@ class DataUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning('No columns specified')
+
+        columns = [column for column in columns if column in df.columns]
+        for col in columns:
+            if isinstance(df, dd.DataFrame):
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(fill_value)  # Replace NaN with 0
+                    .astype(dtype),
+                    meta=(col, dtype)
+                )
+            else:
+                # For Pandas DataFrame, handle mixed types and invalid values
+                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                df[col] = df[col].fillna(fill_value).astype(dtype)
+
+        return df
+
     @staticmethod
     def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
@@ -14,6 +35,7 @@ class DataUtils:
 
         Parameters:
         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
         - fill_value (int or float): The value to replace NA values with.
         - transform_func (callable, optional): The transformation function to apply.
           If None, no additional transformation is applied.
@@ -28,31 +50,28 @@ class DataUtils:
         if not columns:
             return df
 
+        columns = [column for column in columns if column in df.columns]
         # Default transformation function (identity) if none is provided
         if transform_func is None:
             transform_func = lambda x: x
 
-        # Apply transformations
-        for col in columns:
-            dtype = df[col].dtype
-            if pd.api.types.is_integer_dtype(dtype):
-                meta_type = 'int64'
-            elif pd.api.types.is_float_dtype(dtype):
-                meta_type = 'float64'
-            else:
-                continue  # Skip non-numeric columns
+        # Batch processing for Dask
+        if isinstance(df, dd.DataFrame):
+            def transform_partition(partition):
+                # Apply transformations for all numeric columns in a single pass
+                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
+                return partition
+
+            # Apply the transformation function to all specified columns
+            df = df.map_partitions(transform_partition, meta=df)
+        else:
+            # Pandas: Vectorized operations for all specified columns
+            df[columns] = df[columns].fillna(fill_value).map(transform_func)
 
-            df[col] = df[col].fillna(fill_value).astype(meta_type)
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s.apply(transform_func), meta=(col, meta_type)
-                )
-            else:
-                df[col] = df[col].apply(transform_func)
         return df
 
     @staticmethod
-    def transform_boolean_columns(df, columns=None, sample_size=100):
+    def transform_boolean_columns(df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
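
For the reworked transform_numeric_columns, a small pandas-side sketch with made-up numbers; note that the pandas branch relies on DataFrame.map, which requires pandas 2.1 or newer:

    import pandas as pd
    from sibi_dst.utils import DataUtils

    df = pd.DataFrame({"price": [10.0, None, 12.5]})

    # NA values are filled first, then the element-wise transform is applied.
    df = DataUtils.transform_numeric_columns(
        df, columns=["price"], fill_value=0, transform_func=lambda x: x * 2
    )
    print(df["price"].tolist())  # [20.0, 0.0, 25.0]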
@@ -96,73 +115,67 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
-        # Check if the DataFrame is empty
+        # Return early if the DataFrame is empty
         if self.is_dataframe_empty(df):
             return df
 
-        # Extract required parameters with default values
-        source_col = kwargs.pop('source_col', None)
-        lookup_col = kwargs.pop('lookup_col', None)
-        lookup_description_col = kwargs.pop('lookup_description_col', None)
-        source_description_alias = kwargs.pop('source_description_alias', None)
-        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-        fieldnames = kwargs.get('fieldnames', None)
-        column_names = kwargs.get('column_names', None)
+        # Extract and validate required parameters
+        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
+        missing_params = [param for param in required_params if param not in kwargs]
+        if missing_params:
+            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
 
-        # Validate required parameters
-        if not all([source_col, lookup_col, lookup_description_col, source_description_alias]):
-            raise ValueError(
-                'source_col, lookup_col, lookup_description_col, and source_description_alias must be specified'
-            )
+        source_col = kwargs.pop('source_col')
+        lookup_col = kwargs.pop('lookup_col')
+        lookup_description_col = kwargs.pop('lookup_description_col')
+        source_description_alias = kwargs.pop('source_description_alias')
+
+        # Optional parameters with default values
+        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
+        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
+        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
 
         if source_col not in df.columns:
-            self.logger.info(f'{source_col} not in DataFrame columns')
+            self.logger.info(f"{source_col} not in DataFrame columns")
             return df
 
         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
         if isinstance(ids, dd.Series):
             ids = ids.compute()
-        ids = ids.tolist()
-
-        if not ids:
-            self.logger.info(f'No IDs found in the source column: {source_col}')
+        if not len(ids):
+            self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
-        # Set default fieldnames and column_names if not provided
-        if fieldnames is None:
-            kwargs['fieldnames'] = (lookup_col, lookup_description_col)
-        if column_names is None:
-            kwargs['column_names'] = ['temp_join_col', source_description_alias]
-
+        ids = sorted(ids.tolist())
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
-        load_kwargs[f'{lookup_col}__in'] = ids
-
+        load_kwargs.update({
+            'fieldnames': fieldnames,
+            'column_names': column_names,
+            f'{lookup_col}__in': ids
+        })
         # Load lookup data
-        lookup_instance = classname()
+        lookup_instance = classname(debug=True, verbose_debug=True)
         result = lookup_instance.load(**load_kwargs)
-
+        if len(result.index) == 0:
+            self.logger.info(f"No IDs found in the source column: {source_col}")
+            return df
         # Determine the join column on the result DataFrame
-        if 'temp_join_col' in kwargs.get("column_names", []):
-            temp_join_col = 'temp_join_col'
-        else:
-            temp_join_col = lookup_col
+        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
 
         # Merge DataFrames
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias] = df[source_description_alias].fillna('')
+            df[source_description_alias]=df[source_description_alias].fillna('')
 
         # Drop temp_join_col if present
-        if 'temp_join_col' in df.columns:
-            df = df.drop(columns='temp_join_col')
+        df = df.drop(columns='temp_join_col', errors='ignore')
 
         return df
 
-    @staticmethod
-    def is_dataframe_empty(df):
+
+    def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.
 
@@ -173,14 +186,30 @@ class DataUtils:
         - bool: True if the DataFrame is empty, False otherwise.
         """
         if isinstance(df, dd.DataFrame):
-            df_size = df.map_partitions(len).sum().compute()
-            return df_size == 0
-        else:
+            try:
+                return len(df.index) == 0
+            except Exception as e:
+                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                return False
+        elif isinstance(df, pd.DataFrame):
             return df.empty
+        else:
+            self.logger.error("Input must be a pandas or dask DataFrame.")
+            return False
 
     @staticmethod
-    def convert_to_datetime(df, date_fields):
+    def convert_to_datetime_dask(df, date_fields):
+        """
+        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
+
+        Parameters:
+        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
+        - date_fields (list of str): List of column names to convert to datetime.
+
+        Returns:
+        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        """
         for col in date_fields:
             if col in df.columns:
-                df[col] = pd.to_datetime(df[col], errors='coerce')
-        return df
+                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
+        return df
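
A short sketch of the renamed convert_to_datetime_dask static method with invented data; values that cannot be parsed coerce to NaT:

    import dask.dataframe as dd
    import pandas as pd
    from sibi_dst.utils import DataUtils

    ddf = dd.from_pandas(
        pd.DataFrame({"created_at": ["2024-01-01", "not-a-date"]}), npartitions=1
    )

    ddf = DataUtils.convert_to_datetime_dask(ddf, ["created_at"])
    print(ddf["created_at"].compute())  # second value becomes NaT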
@@ -164,7 +164,7 @@ class DataWrapper:
 
         date_range = self.generate_date_range()
         if self.show_progress:
-            date_range = tqdm(date_range, desc="Evaluating update plan", unit="date")
+            date_range = tqdm(date_range, desc=f"Evaluating update plan {self.__class__.__name__}", unit="date")
 
         for current_date in date_range:
             folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
@@ -207,325 +207,7 @@ class DataWrapper:
         return update_plan_table
 
 
-# import datetime
-# from typing import Type, Any, Dict, Optional
-# import fsspec
-# import pandas as pd
-# from IPython.display import display
-#
-# from sibi_dst.utils import Logger
-# from tqdm import tqdm
-# from sibi_dst.utils import ParquetSaver
-#
-# class DataWrapper:
-#     DEFAULT_MAX_AGE_MINUTES = 1440
-#     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
-#
-#     def __init__(self,
-#                  dataclass: Type,
-#                  date_field: str,
-#                  data_path: str,
-#                  parquet_filename: str,
-#                  start_date: Any,
-#                  end_date: Any,
-#                  filesystem_type: str = "file",
-#                  filesystem_options: Optional[Dict] = None,
-#                  verbose: bool = False,
-#                  class_params: Optional[Dict] = None,
-#                  load_params: Optional[Dict] = None,
-#                  reverse_order: bool = False,
-#                  overwrite: bool = False,
-#                  ignore_missing: bool = False,
-#                  logger: Optional[Logger] = None,
-#                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
-#                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
-#                  show_progress: bool = False):
-#         self.dataclass = dataclass
-#         self.date_field = date_field
-#         self.data_path = self.ensure_forward_slash(data_path)
-#         self.parquet_filename = parquet_filename
-#         self.filesystem_type = filesystem_type
-#         self.filesystem_options = filesystem_options or {}
-#         self.fs = fsspec.filesystem(filesystem_type, **self.filesystem_options)
-#         self.verbose = verbose
-#         self.class_params = class_params or {}
-#         self.load_params = load_params or {}
-#         self.reverse_order = reverse_order
-#         self.overwrite = overwrite
-#         self.ignore_missing = ignore_missing
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#         self.max_age_minutes = max_age_minutes
-#         self.history_days_threshold = history_days_threshold
-#         self.show_progress = show_progress
-#
-#         self.start_date = self.convert_to_date(start_date)
-#         self.end_date = self.convert_to_date(end_date)
-#
-#
-#     def convert_to_date(self, date: Any) -> datetime.date:
-#         try:
-#             return datetime.datetime.strptime(date, '%Y-%m-%d').date() if isinstance(date, str) else date
-#         except ValueError as e:
-#             self.logger.error(f"Error converting {date} to datetime: {e}")
-#             raise
-#
-#     @staticmethod
-#     def ensure_forward_slash(path: str) -> str:
-#         return path if path.endswith('/') else path + '/'
-#
-#     def generate_date_range(self):
-#         step = -1 if self.reverse_order else 1
-#         start, end = (self.end_date, self.start_date) if self.reverse_order else (self.start_date, self.end_date)
-#         current_date = start
-#         while current_date != end + datetime.timedelta(days=step):
-#             yield current_date
-#             current_date += datetime.timedelta(days=step)
-#
-#     def process(self):
-#         """Execute the update plan following the specified hierarchy."""
-#         update_plan, update_plan_table = self.generate_update_plan_with_conditions()
-#
-#         # Display the update plan table to the user
-#
-#         display(update_plan_table)
-#
-#         # Process files according to the hierarchy, considering only `update_required` dates
-#         for category, description in [
-#             ("overwrite", "Processing files due to overwrite=True"),
-#             ("history_days", "Processing files within history_days_threshold"),
-#             ("missing_files", "Processing missing files")
-#         ]:
-#             # Filter dates in the category where `update_required` is True
-#             dates_to_process = update_plan_table[
-#                 (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
-#             ]["date"].tolist()
-#
-#             for current_date in tqdm(dates_to_process, desc=description, unit="date"):
-#                 self.process_date(current_date)
-#
-#     def is_file_older_than(self, file_path: str, current_date: datetime.date) -> bool:
-#         """
-#         Check if a file is older than the specified max_age_minutes.
-#         """
-#         if not self.fs.exists(file_path):
-#             return True  # Treat missing files as old
-#
-#         # Get the file modification time
-#         file_modification_time = self.fs.info(file_path)['mtime']
-#         file_modification_datetime = datetime.datetime.fromtimestamp(file_modification_time, tz=datetime.timezone.utc)
-#
-#         # Get the current UTC time as a timezone-aware object
-#         current_time = datetime.datetime.now(datetime.timezone.utc)
-#
-#         # Calculate file age in seconds and minutes
-#         file_age_seconds = (current_time - file_modification_datetime).total_seconds()
-#         file_age_minutes = file_age_seconds / 60
-#
-#         if self.verbose:
-#             self.logger.info(
-#                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old (threshold: {self.max_age_minutes} minutes)")
-#
-#         # Check if the file date is within the history threshold
-#         history_start_date = datetime.date.today() - datetime.timedelta(days=self.history_days_threshold)
-#         within_history_threshold = current_date >= history_start_date
-#
-#         # File is considered old if it exceeds max_age_minutes and is within the history threshold
-#         return file_age_minutes > self.max_age_minutes and within_history_threshold
-#
-#     def process_date(self, date: datetime.date):
-#         """Process a specific date by regenerating data as necessary."""
-#         folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
-#         full_parquet_filename = f"{folder}{self.parquet_filename}"
-#
-#         start_time = datetime.datetime.now()
-#
-#         if self.verbose:
-#             self.logger.info(f"Processing {full_parquet_filename}...")
-#
-#         data_object = self.dataclass(**self.class_params)
-#         #date_filter_params = {
-#         #    f'{self.date_field}__year': date.year,
-#         #    f'{self.date_field}__month': date.month,
-#         #    f'{self.date_field}__day': date.day
-#         #}
-#         df=data_object.load_period(dt_field=self.date_field, start=date, end=date)
-#         #df = data_object.load(**self.load_params, **date_filter_params)
-#
-#         if len(df.index) == 0:
-#             if self.verbose:
-#                 self.logger.info("No data found for the specified date.")
-#             return
-#
-#         parquet_saver = ParquetSaver(df, folder, self.logger)
-#         parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
-#
-#         end_time = datetime.datetime.now()
-#         duration_seconds = (end_time - start_time).total_seconds()
-#
-#         if self.verbose:
-#             self.logger.info(f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds")
-#
-#
-#     def remove_empty_directories(self, path: str):
-#         if not self.fs.isdir(path) or self.fs.abspath(path) == self.fs.abspath(self.data_path):
-#             return
-#
-#         if not self.fs.ls(path):  # Check if directory is empty
-#             try:
-#                 self.fs.rmdir(path)
-#                 if self.verbose:
-#                     self.logger.info(f"Removed empty directory: {path}")
-#                 self.remove_empty_directories(self.fs.path.dirname(path))
-#             except Exception as e:
-#                 if self.verbose:
-#                     self.logger.error(f"Error removing directory {path}: {e}")
-#         else:
-#             if self.verbose:
-#                 self.logger.info(f"Directory not empty, stopping: {path}")
-#
-#     def generate_update_plan_with_conditions(self):
-#         """
-#         Generate an update plan that evaluates files based on the specified hierarchy:
-#         1. Overwrite (all files regenerated).
-#         2. History threshold: Files within `history_days_threshold` are evaluated for `max_age_minutes`.
-#         3. Missing files: Detect missing files, ignoring future dates.
-#         """
-#         update_plan = {
-#             "overwrite": [],
-#             "history_days": [],
-#             "missing_files": []
-#         }
-#         rows = []
-#
-#         today = datetime.date.today()
-#         history_start_date = today - datetime.timedelta(
-#             days=self.history_days_threshold) if self.history_days_threshold else None
-#
-#         for current_date in tqdm(self.generate_date_range(), desc="Evaluating update plan", unit="date"):
-#             folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
-#             full_parquet_filename = f"{folder}{self.parquet_filename}"
-#
-#             file_exists = self.fs.exists(full_parquet_filename)
-#             file_age_minutes = None  # Initialize file_age_minutes as None
-#             file_is_old = False
-#             within_history = False
-#             missing_file = not file_exists and not self.ignore_missing
-#             category = None
-#
-#             if file_exists:
-#                 # Calculate file age in minutes
-#                 file_modification_time = self.fs.info(full_parquet_filename)['mtime']
-#                 file_modification_datetime = datetime.datetime.fromtimestamp(file_modification_time,
-#                                                                              tz=datetime.timezone.utc)
-#                 current_time = datetime.datetime.now(datetime.timezone.utc)
-#                 file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
-#
-#                 # Determine if the file is old
-#                 file_is_old = file_age_minutes > self.max_age_minutes
-#
-#             # Determine if the file is within the history threshold
-#             if self.history_days_threshold and history_start_date and history_start_date <= current_date <= today:
-#                 within_history = True
-#
-#             # Hierarchy 1: Overwrite (all files are marked for regeneration)
-#             if self.overwrite:
-#                 category = "overwrite"
-#
-#             # Hierarchy 2: History threshold evaluation
-#             elif within_history and (missing_file or file_is_old):
-#                 category = "history_days"
-#
-#             # Hierarchy 3: Detect missing files, ignoring future dates
-#             elif missing_file and current_date <= today:
-#                 category = "missing_files"
-#
-#             # Append to update plan
-#             if category:
-#                 update_plan[category].append(current_date)
-#
-#             # Collect condition descriptions for the update plan table
-#             rows.append({
-#                 "date": current_date,
-#                 "file_exists": file_exists,
-#                 "file_age_minutes": file_age_minutes,  # Add file age to the table
-#                 "file_is_old": file_is_old,
-#                 "within_history": within_history,
-#                 "missing_file": missing_file,
-#                 "update_required": category is not None,  # Mark as true if a category is assigned
-#                 "update_category": category
-#             })
-#
-#         # Sort dates in descending order if reverse_order is True
-#         if self.reverse_order:
-#             for key in update_plan:
-#                 update_plan[key].sort(reverse=True)
-#
-#         update_plan_table = pd.DataFrame(rows)
-#         return update_plan, update_plan_table
-#     # def generate_update_plan_with_conditions(self):
-#     #     """
-#     #     Generate an update plan that evaluates files based on the specified hierarchy:
-#     #     1. Overwrite (all files regenerated).
-#     #     2. History threshold: Files within `history_days_threshold` are evaluated for `max_age_minutes`.
-#     #     3. Missing files: Detect missing files, ignoring future dates.
-#     #     """
-#     #     update_plan = {
-#     #         "overwrite": [],
-#     #         "history_days": [],
-#     #         "missing_files": []
-#     #     }
-#     #     rows = []
-#     #
-#     #     today = datetime.date.today()
-#     #     history_start_date = today - datetime.timedelta(days=self.history_days_threshold) if self.history_days_threshold else None
-#     #
-#     #     for current_date in tqdm(self.generate_date_range(), desc="Evaluating update plan", unit="date"):
-#     #         folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
-#     #         full_parquet_filename = f"{folder}{self.parquet_filename}"
-#     #
-#     #         file_exists = self.fs.exists(full_parquet_filename)
-#     #         file_is_old = file_exists and self.is_file_older_than(full_parquet_filename, current_date)
-#     #         within_history = False
-#     #         missing_file = not file_exists and not self.ignore_missing
-#     #         category = None
-#     #
-#     #         # Hierarchy 1: Overwrite (all files are marked for regeneration)
-#     #         if self.overwrite:
-#     #             category = "overwrite"
-#     #
-#     #         # Hierarchy 2: History threshold evaluation
-#     #         elif self.history_days_threshold and history_start_date and history_start_date <= current_date <= today:
-#     #             within_history = True
-#     #             if missing_file or self.is_file_older_than(full_parquet_filename, current_date):
-#     #                 category = "history_days"
-#     #
-#     #         # Hierarchy 3: Detect missing files, ignoring future dates
-#     #         elif missing_file and current_date <= today:
-#     #             category = "missing_files"
-#     #
-#     #         # Append to update plan
-#     #         if category:
-#     #             update_plan[category].append(current_date)
-#     #
-#     #         # Collect condition descriptions for the update plan table
-#     #         rows.append({
-#     #             "date": current_date,
-#     #             "file_exists": file_exists,
-#     #             "file_is_old": file_is_old,
-#     #             "within_history": within_history,
-#     #             "missing_file": missing_file,
-#     #             "update_required": category is not None,
-#     #             "update_category": category
-#     #         })
-#     #
-#     #         # Sort dates in descending order if reverse_order is True
-#     #         if self.reverse_order:
-#     #             for key in update_plan:
-#     #                 update_plan[key].sort(reverse=True)
-#     #
-#     #         update_plan_table = pd.DataFrame(rows)
-#     #         return update_plan, update_plan_table
-#
+
 # # Usage:
 # # wrapper = DataWrapper(
 # #     dataclass=YourDataClass,
@@ -1,5 +1,7 @@
 import datetime
 from typing import Union, Tuple, Callable, Dict, Any
+
+import numpy as np
 import pandas as pd
 from sibi_dst.utils import Logger
 
@@ -118,6 +120,134 @@ class DateUtils:
             'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
         }
 
+class BusinessDays:
+    def __init__(self, holiday_list, logger):
+        """
+        Initialize a BusinessDays object with a given holiday list.
+        """
+        self.logger = logger
+        self.HOLIDAY_LIST = holiday_list
+        bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
+        self.bd_cal = np.busdaycalendar(holidays=bd_holidays, weekmask="1111100")
+        self.holidays = self.bd_cal.holidays
+        self.week_mask = self.bd_cal.weekmask
+
+    def get_business_days_count(self, begin_date, end_date):
+        """
+        Calculate the number of business days between two dates.
+        """
+        try:
+            begin_date = pd.to_datetime(begin_date)
+            end_date = pd.to_datetime(end_date)
+        except Exception as e:
+            raise ValueError(f"Invalid date format: {e}")
+
+        years = [str(year) for year in range(begin_date.year, end_date.year + 1)]
+        if not all(year in self.HOLIDAY_LIST for year in years):
+            raise ValueError("Not all years in date range are in the holiday list")
+
+        return np.busday_count(
+            begin_date.strftime("%Y-%m-%d"),
+            end_date.strftime("%Y-%m-%d"),
+            busdaycal=self.bd_cal,
+        )
+
+    def calc_business_days_from_df(self, df, begin_date_col, end_date_col, result_col="business_days"):
+        """
+        Add a column to a Dask DataFrame with the number of business days between two date columns.
+        """
+        if not all(col in df.columns for col in [begin_date_col, end_date_col]):
+            self.logger.error("Column names not found in DataFrame")
+            raise ValueError("Required columns are missing")
+
+        # Extract holidays and weekmask to recreate the busdaycalendar
+        holidays = self.bd_cal.holidays
+        weekmask = self.bd_cal.weekmask
+
+        # Define a function to calculate business days
+        def calculate_business_days(row, holidays, weekmask):
+            begin_date = pd.to_datetime(row[begin_date_col])
+            end_date = pd.to_datetime(row[end_date_col])
+            busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
+            return np.busday_count(
+                begin_date.strftime("%Y-%m-%d"),
+                end_date.strftime("%Y-%m-%d"),
+                busdaycal=busdaycal,
+            )
+
+        # Define a wrapper function for partition-wise operations
+        def apply_business_days(partition, holidays, weekmask):
+            return partition.apply(
+                calculate_business_days, axis=1, holidays=holidays, weekmask=weekmask
+            )
+
+        # Apply the function using map_partitions
+        df[result_col] = df.map_partitions(
+            apply_business_days,
+            holidays,
+            weekmask,
+            meta=(result_col, "int64"),
+        )
+
+        return df
+
+    def add_business_days(self, start_date, n_days):
+        """
+        Add n_days business days to start_date.
+        """
+        try:
+            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
+        except ValueError:
+            raise ValueError("Date should be a string in the format YYYY-MM-DD")
+
+        if str(start_date.year) not in self.HOLIDAY_LIST:
+            self.logger.warning(f"Year {start_date.year} is not in the holiday list")
+
+        return np.busday_offset(
+            start_date.strftime("%Y-%m-%d"),
+            n_days,
+            roll="forward",
+            busdaycal=self.bd_cal,
+        )
+
+    def calc_sla_end_date(self, df, start_date_col, n_days_col, result_col="sla_end_date"):
+        """
+        Add a column to a Dask DataFrame with SLA end dates based on start date and SLA days.
+        """
+        if not all(col in df.columns for col in [start_date_col, n_days_col]):
+            raise ValueError("Column names not found in DataFrame")
+
+        # Extract holidays and weekmask to recreate the busdaycalendar
+        holidays = self.bd_cal.holidays
+        weekmask = self.bd_cal.weekmask
+
+        # Define a function to calculate SLA end dates
+        def calculate_sla_end_date(row, holidays, weekmask):
+            start_date = pd.to_datetime(row[start_date_col])
+            n_days = row[n_days_col]
+            busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
+            return np.busday_offset(
+                start_date.strftime("%Y-%m-%d"),
+                n_days,
+                roll="forward",
+                busdaycal=busdaycal,
+            )
+
+        # Define a wrapper for partition-wise operation
+        def apply_sla_end_date(partition, holidays, weekmask):
+            return partition.apply(
+                calculate_sla_end_date, axis=1, holidays=holidays, weekmask=weekmask
+            )
+
+        # Apply the function using map_partitions
+        df[result_col] = df.map_partitions(
+            apply_sla_end_date,
+            holidays,
+            weekmask,
+            meta=(result_col, "object"),
+        )
+
+        return df
 # Class enhancements
 # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
 #                                                 datetime.date.today() + datetime.timedelta(days=13)))
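
A hedged usage sketch for the new BusinessDays helper. The holiday calendar below is invented, but its shape (string year keys mapping to lists of ISO dates) follows what the constructor iterates over:

    from sibi_dst.utils import BusinessDays, Logger

    holidays = {"2024": ["2024-01-01", "2024-12-25"]}  # hypothetical calendar
    bd = BusinessDays(holidays, Logger.default_logger(logger_name="BusinessDays"))

    print(bd.get_business_days_count("2024-01-02", "2024-01-10"))  # business-day count between the dates
    print(bd.add_business_days("2024-01-02", 5))                   # numpy date five business days later

The count method raises and add_business_days warns when a year in the requested range is missing from the holiday list, so the calendar should cover every year queried.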
@@ -12,6 +12,97 @@ class DfUtils:
         """
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
+    def align_and_merge_by_type(self, df_left, df_right, type_mapping, how='left'):
+        """
+        Align column data types in two DataFrames based on a type mapping dictionary and perform the merge.
+
+        Parameters:
+        - df_left (pd.DataFrame or dd.DataFrame): Left DataFrame
+        - df_right (pd.DataFrame or dd.DataFrame): Right DataFrame
+        - type_mapping (dict): Dictionary mapping target dtypes to column pairs.
+          Example: {
+              'integer': [('customer_id', 'temp1'), ('product_type_id', 'temp2')],
+              'string': [('group2', 'temp4')]
+          }
+
+        Returns:
+        - Merged DataFrame
+        """
+        # Map string keys to actual dtypes
+        dtype_map = {
+            'integer': 'int64',
+            'float': 'float64',
+            'string': 'string',
+            'datetime': 'datetime64[ns]',
+            'boolean': 'bool',
+        }
+
+        # Iterate over each dtype and align the column pairs
+        for target_type, column_pairs in type_mapping.items():
+            if target_type not in dtype_map:
+                self.logger.error(f"Unsupported type: {target_type}")
+
+            for left_col, right_col in column_pairs:
+                # Align dtypes in left and right DataFrames
+                if left_col in df_left.columns and right_col in df_right.columns:
+                    df_left[left_col] = df_left[left_col].astype(dtype_map[target_type])
+                    df_right[right_col] = df_right[right_col].astype(dtype_map[target_type])
+
+        # Flatten all column pairs for the merge operation
+        all_pairs = [pair for pairs in type_mapping.values() for pair in pairs]
+
+        # Perform the merge
+        return df_left.merge(
+            df_right,
+            how=how,
+            left_on=[pair[0] for pair in all_pairs],
+            right_on=[pair[1] for pair in all_pairs]
+        )
+
+    def exclude_from_dataframe(self, df, conditions):
+        """
+        Generic function to filter rows from a DataFrame (Pandas or Dask).
+
+        Parameters:
+        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame to filter.
+        - conditions (list of tuples): List of conditions to apply for filtering.
+          Each condition is a tuple: (column_name, operator, value).
+
+        Returns:
+        - pandas.DataFrame or dask.dataframe.DataFrame: Filtered DataFrame.
+        """
+        import operator
+
+        # Mapping string operators to actual Python operators
+        ops = {
+            "==": operator.eq,
+            "!=": operator.ne,
+            "<": operator.lt,
+            "<=": operator.le,
+            ">": operator.gt,
+            ">=": operator.ge,
+        }
+        # Ensure all specified columns exist in the DataFrame
+        missing_columns = [col for col, _, _ in conditions if col not in df.columns]
+        if missing_columns:
+            self.logger.info(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+            return df
+
+        # Build the combined filtering condition
+        combined_condition = None
+        for col, op, value in conditions:
+            if op not in ops:
+                raise ValueError(f"Unsupported operator: {op}")
+
+            # Get the individual condition
+            condition = ops[op](df[col], value)
+
+            # Combine the condition with AND (&)
+            combined_condition = condition if combined_condition is None else (combined_condition & condition)
+
+        # Apply the filtering and return the DataFrame
+        return df[~combined_condition]
+
     def load_grouped_activity(self, df, group_by_expr, group_expr='count', debug=False):
         """
         Groups the DataFrame by the specified expression and computes the size.
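
An illustrative call to the new exclude_from_dataframe helper with invented data; the combined condition is negated, so rows matching every condition are dropped. This assumes DfUtils is importable from sibi_dst.utils (it lives in sibi_dst/utils/_df_utils.py per the RECORD below); adjust the import if it is only exposed from that module:

    import pandas as pd
    from sibi_dst.utils import DfUtils  # assumption: DfUtils is re-exported here

    df = pd.DataFrame({"status": ["open", "void", "void"], "total": [10, 0, 25]})

    du = DfUtils()
    # Drop rows where status == "void" AND total <= 0 (only the middle row here).
    filtered = du.exclude_from_dataframe(df, [("status", "==", "void"), ("total", "<=", 0)])
    print(filtered["status"].tolist())  # ['open', 'void']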
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.11
+Version: 0.3.14
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -9,10 +9,13 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: chardet (>=5.2.0,<6.0.0)
+Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
-Requires-Dist: django (==4.1.13)
+Requires-Dist: django (>=5.1.4,<6.0.0)
+Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
 Requires-Dist: httpx (>=0.27.2,<0.28.0)
 Requires-Dist: ipython (>=8.29.0,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -26,9 +29,10 @@ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
 Requires-Dist: pytest (>=8.3.3,<9.0.0)
 Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
-Requires-Dist: sqlmodel (>=0.0.22,<0.0.23)
+Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst
@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=e-ptCEDYt5dx8byNiA0ca8Eejl1DG1V5pioZUzabEnY,12747
+sibi_dst/df_helper/_df_helper.py,sha256=iYG8uL1ILrBvjtH8oiSwbPHnlDsJLlHtSghDDlt7T-w,13365
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
 sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
 sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
@@ -11,35 +11,35 @@ sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X
 sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
 sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NrkBb5LM1A_vo3wAotqj2sVVYIuTfFGrQqIXk3xOoDs,5162
 sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
-sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=jryDojeA62rB3seRaWWMjsAmekKacK5xctwCQGVklPQ,9063
+sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=aGaHziEMWK4zk9kkMNq2QtVevqVOCWqoAlXT1lVgRok,9198
 sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
 sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
-sibi_dst/df_helper/plugins/http/_http_config.py,sha256=TaoI0F5S-Gf9jiWJp3ngQZTw2jlks-_WNDzKX1Wybtc,2165
+sibi_dst/df_helper/plugins/http/_http_config.py,sha256=WH0d4vsxfZRhWrWI4iTVAnhsdY3421SBr9kXYZVfeYQ,2126
 sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
-sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=45mHID1azAg5PmaYWbuRlghoRd3H2aTLj1XcycfLJo0,3497
-sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=cKyRj0UCby9-iYPPFnlel1H03x8MnAoEv8k1tp7kHXw,4277
+sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
+sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
 sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
-sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=6IjQEREXqTAzSJE95FKfXjRkTlEjRMS4hJ_yMpyKDTg,5223
+sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=1WQ390XBFWOSXo0ea6-hz1LM6Ppi-j6ToZYr7sQBldE,5330
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=myrtEzK12DvA73x7QFaqXFb_TxOPMrsVj-mxYHJD2dg,2371
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=balWGKun0NKIfhLZW-_DCOhKuTzTo_C2NwZoKFwuSJo,2329
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTeFcrf8sFpzqLlQfspjduKuR7Wgn7mDPUASsQs6s,4355
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
-sibi_dst/utils/__init__.py,sha256=jiXJSnmsaGZTRhUThtIo6cssWXBWXNij8ffYmv77QK4,797
+sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,831
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
-sibi_dst/utils/_clickhouse_writer.py,sha256=mdgszbyVluhGvDmvsHY4XDTZrp42L3xtdmiyn3z2bYM,8534
+sibi_dst/utils/_clickhouse_writer.py,sha256=kNBQeDn3D4Javrz5L8uU_5itf8Mrvm9l29uxcmcKlbg,8555
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=3hBMg852ANpS5bOtlU-F4H-Q91WIGga5LrKWWyDvnAA,7354
-sibi_dst/utils/_data_wrapper.py,sha256=pZnylBFTvsLGfYGv2tTyQHzyb6IbIahfaXR-PxHdivk,24099
-sibi_dst/utils/_date_utils.py,sha256=6HCrcTiuYLNsbgrNB3eAVAAgXbfx7Ce1qNc3OJla9nM,5621
-sibi_dst/utils/_df_utils.py,sha256=o2bK5-xMGKqIG4i9xfavYRxIkiHLA0nz5TQTN78998k,7350
+sibi_dst/utils/_data_utils.py,sha256=BvmjMNSkICy671BmjW68RhvDMfN5uAXwhffSV-wEwmk,9185
+sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
+sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
+sibi_dst/utils/_df_utils.py,sha256=9_dNYoZ9_ofU0t_sxMdsXALWCuh02gvqUrei-6Lhr6w,10910
 sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
 sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.11.dist-info/METADATA,sha256=gwl565etE5wLVGk0rqQ7umOyBRtEXpQ_IdCXyEkv2s8,1897
-sibi_dst-0.3.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sibi_dst-0.3.11.dist-info/RECORD,,
+sibi_dst-0.3.14.dist-info/METADATA,sha256=ysmNqT8NnhY_VlPmrQ2U3FnXWFEIvfwFRi8uSGRP6g0,2090
+sibi_dst-0.3.14.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.14.dist-info/RECORD,,