sibi-dst 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +42 -30
- sibi_dst/df_helper/core/__init__.py +6 -4
- sibi_dst/df_helper/core/_filter_handler.py +216 -0
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py +32 -20
- sibi_dst/df_helper/plugins/django/_io_dask.py +0 -3
- sibi_dst/df_helper/plugins/http/_http_config.py +4 -4
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -9
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +4 -2
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +8 -6
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +5 -2
- sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +2 -3
- sibi_dst/utils/_clickhouse_writer.py +16 -16
- sibi_dst/utils/_data_utils.py +40 -81
- sibi_dst/utils/_data_wrapper.py +8 -4
- sibi_dst/utils/_df_utils.py +5 -5
- sibi_dst/utils/_log_utils.py +3 -0
- sibi_dst/utils/_parquet_saver.py +3 -108
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/METADATA +2 -1
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/RECORD +20 -19
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py CHANGED
@@ -1,4 +1,5 @@
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -28,7 +29,6 @@ class SqlAlchemyLoadFromDb:
         self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
-        self.verbose_debug = kwargs.pop("verbose_debug", False)
 
     def build_and_load(self) -> dd.DataFrame:
         """
@@ -40,7 +40,6 @@ class SqlAlchemyLoadFromDb:
     def _build_and_load(self) -> dd.DataFrame:
 
         try:
-            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
             self.df = SQLAlchemyDask(
                 model=self.model,
                 filters=self.params_config.filters,
@@ -49,10 +48,13 @@ class SqlAlchemyLoadFromDb:
                 chunk_size=1000,
                 debug=self.debug).read_frame()
             if self.df is None or len(self.df.head().index) == 0:
-                self.logger.
-
+                self.logger.debug("Query returned no results.")
+                dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+                return dask_df
             return self.df
         except Exception as e:
-            self.logger.
-
+            self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+            return dask_df
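Both failure paths in the hunk above now log at debug level and hand back an empty Dask DataFrame instead of propagating the error. A minimal standalone sketch of that fallback pattern, using only dask and pandas (the read_or_empty helper is illustrative and not part of sibi_dst):

import dask.dataframe as dd
import pandas as pd

def read_or_empty(read_frame):
    # read_frame: any zero-argument callable that returns a Dask DataFrame.
    try:
        df = read_frame()
        if df is None or len(df.head().index) == 0:
            return dd.from_pandas(pd.DataFrame(), npartitions=1)  # empty fallback
        return df
    except Exception:
        # mirror the new behaviour: swallow the error and return an empty frame
        return dd.from_pandas(pd.DataFrame(), npartitions=1)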
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py CHANGED
@@ -59,7 +59,7 @@ class SqlAlchemyModelBuilder:
         attrs = {
             "__tablename__": self.table_name,
             "__table__": self.table,
-
+            "__module__": f"{apps_label}.models",
             "__mapper_args__": {"eager_defaults": True},
         }
 
@@ -82,9 +82,12 @@ class SqlAlchemyModelBuilder:
            dict: Dictionary of column attributes.
        """
        columns = {}
+       reserved_names = ["metadata", "class_", "table"]
+
        for column in table.columns:
            column_name = self.normalize_column_name(column.name)
-
+           if column_name not in reserved_names:
+               columns[column_name] = column
        return columns
 
    def add_relationships(self, attrs, table: Table):
sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py CHANGED
@@ -26,7 +26,6 @@ class SQLModelLoadFromDb:
         self.query_config = db_query or {}
         self.params_config = db_params or {}
         self.debug = kwargs.pop("debug", False)
-        self.verbose_debug = kwargs.pop("verbose_debug", False)
 
     def _default_logger(self):
         """Create a default logger."""
@@ -69,7 +68,7 @@ class SQLModelLoadFromDb:
             query = query.limit(n_records)
 
         # Debug: Log the SQL query
-        self.logger.
+        self.logger.debug(f"Executing query: {str(query)}")
 
         # Execute the query
         results = session.exec(query).fetchall()
@@ -79,7 +78,7 @@ class SQLModelLoadFromDb:
             if results:
                 df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
             else:
-                self.logger.
+                self.logger.debug("Query returned no results.")
                 df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         except Exception as e:
sibi_dst/utils/_clickhouse_writer.py CHANGED
@@ -34,7 +34,7 @@ class ClickHouseWriter:
         self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
         if len(self.df.head().index) == 0:
-            self.logger.
+            self.logger.debug("Dataframe is empty")
             return
         self._handle_missing_values()
         self._connect()
@@ -51,7 +51,7 @@ class ClickHouseWriter:
                 user=self.clickhouse_user,
                 password=self.clickhouse_password
             )
-            self.logger.
+            self.logger.debug("Connected to ClickHouse")
         except Exception as e:
             self.logger.error(e)
             raise
@@ -80,7 +80,7 @@ class ClickHouseWriter:
     def _drop_table(self):
         if self.client:
             self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
-            self.logger.
+            self.logger.debug(f"Dropped table {self.clickhouse_table}")
 
     def _create_table_from_dask(self, engine=None):
         if engine is None:
@@ -88,18 +88,18 @@ class ClickHouseWriter:
         dtypes = self.df.dtypes
         clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
         create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
-        self.logger.
+        self.logger.debug(f"Creating table SQL:{create_table_sql}")
         if self.client:
             self.client.command(create_table_sql)
-            self.logger.
+            self.logger.debug("Created table '{}'".format(self.clickhouse_table))
 
     def _handle_missing_values(self):
         """
         Handle missing values in the Dask DataFrame before writing to ClickHouse.
         """
-        self.logger.
+        self.logger.debug("Checking for missing values...")
         missing_counts = self.df.isnull().sum().compute()
-        self.logger.
+        self.logger.debug(f"Missing values per column:\n{missing_counts}")
 
         # Replace missing values based on column types
         def replace_missing_values(df):
@@ -116,14 +116,14 @@ class ClickHouseWriter:
 
         # Apply replacement
         self.df = replace_missing_values(self.df)
-        self.logger.
+        self.logger.debug("Missing values replaced.")
 
     def _write_data(self):
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
         if len(self.df.head().index) == 0:
-            self.logger.
+            self.logger.debug("No data found. Nothing written.")
             return
 
         for i, partition in enumerate(self.df.to_delayed()):
@@ -132,10 +132,10 @@ class ClickHouseWriter:
             df = partition.compute()
 
             if df.empty:
-                self.logger.
+                self.logger.debug(f"Partition {i} is empty. Skipping...")
                 continue
 
-            self.logger.
+            self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
 
             # Write the partition to the ClickHouse table
             self.client.insert_df(self.clickhouse_table, df)
@@ -148,7 +148,7 @@ class ClickHouseWriter:
         Ensures a separate client instance is used per thread to avoid session conflicts.
         """
         if len(self.df.index) == 0:
-            self.logger.
+            self.logger.debug("No data found. Nothing written.")
             return
 
         def create_client():
@@ -170,13 +170,13 @@ class ClickHouseWriter:
             Write a single partition to ClickHouse using a separate client instance.
             """
             try:
-                self.logger.
+                self.logger.debug(f"Starting to process partition {index}")
                 client = create_client()  # Create a new client for the thread
 
                 # Compute the Dask partition into a Pandas DataFrame
                 df = partition.compute()
                 if df.empty:
-                    self.logger.
+                    self.logger.debug(f"Partition {index} is empty. Skipping...")
                     return
 
                 # Convert DataFrame to list of tuples
@@ -184,7 +184,7 @@ class ClickHouseWriter:
                 columns = df.columns.tolist()
 
                 # Perform the insert
-                self.logger.
+                self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
                 client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
 
             except Exception as e:
@@ -192,7 +192,7 @@ class ClickHouseWriter:
             finally:
                 if 'client' in locals() and hasattr(client, 'close'):
                     client.close()
-                    self.logger.
+                    self.logger.debug(f"Closed client for partition {index}")
 
         try:
             # Get delayed partitions and enumerate them
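Every write path in ClickHouseWriter shown above follows the same per-partition shape: materialize one Dask partition, skip it if empty, then insert it. A generic sketch of that iteration, independent of the ClickHouse client (the print stands in for the insert call):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=3)

for i, partition in enumerate(ddf.to_delayed()):
    pdf = partition.compute()   # materialize one partition as a pandas DataFrame
    if pdf.empty:
        continue                # skip empty partitions, as the writer does
    print(f"partition {i}: {len(pdf)} rows")  # the writer calls client.insert_df(table, pdf) here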
sibi_dst/utils/_data_utils.py CHANGED
@@ -1,77 +1,32 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class DataUtils:
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, **kwargs):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = kwargs.get('debug', False)
 
     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
         if not columns:
             self.logger.warning('No columns specified')
-
+        self.logger.debug(f'Dataframe type:{type(df)}')
         columns = [column for column in columns if column in df.columns]
         for col in columns:
-
-
-
-
-
-
-
-            )
-            else:
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(fill_value).astype(dtype)
+            # Replace NaN with 0, then convert to boolean
+            df[col] = df[col].map_partitions(
+                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                .fillna(fill_value)  # Replace NaN with 0
+                .astype(dtype),
+                meta=(col, dtype)
+            )
 
         return df
 
-
-    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
-        """
-        Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
-
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
-        - fill_value (int or float): The value to replace NA values with.
-        - transform_func (callable, optional): The transformation function to apply.
-          If None, no additional transformation is applied.
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
-        """
-        if columns is None:
-            # Detect numeric columns
-            columns = df.select_dtypes(include=['number']).columns.tolist()
-
-        if not columns:
-            return df
-
-        columns = [column for column in columns if column in df.columns]
-        # Default transformation function (identity) if none is provided
-        if transform_func is None:
-            transform_func = lambda x: x
-
-        # Batch processing for Dask
-        if isinstance(df, dd.DataFrame):
-            def transform_partition(partition):
-                # Apply transformations for all numeric columns in a single pass
-                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
-                return partition
-
-            # Apply the transformation function to all specified columns
-            df = df.map_partitions(transform_partition, meta=df)
-        else:
-            # Pandas: Vectorized operations for all specified columns
-            df[columns] = df[columns].fillna(fill_value).map(transform_func)
-
-        return df
-
-    @staticmethod
-    def transform_boolean_columns(df, columns=None):
+    def transform_boolean_columns(self, df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
@@ -84,23 +39,20 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+
         # Apply transformation to each specified column
         for col in columns:
             if col in df.columns:
-
-
-
-
-
-
-
-
-
-
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(0).astype(int).astype(bool)
-
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(0)  # Replace NaN with 0
+                    .astype(int)  # Ensure integer type
+                    .astype(bool),  # Convert to boolean
+                    meta=(col, 'bool')
+                )
+        if self.debug:
+            self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
         return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
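The rewritten transform_boolean_columns above pushes the coercion into each partition with map_partitions instead of branching on the DataFrame type. A standalone sketch of the same pattern on a toy column (column name is illustrative only):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"flag": ["1", "0", None, "1"]}), npartitions=2)

# Coerce to numeric, fill NaN with 0, then cast to bool, partition by partition.
ddf["flag"] = ddf["flag"].map_partitions(
    lambda s: pd.to_numeric(s, errors="coerce").fillna(0).astype(int).astype(bool),
    meta=("flag", "bool"),
)
print(ddf.compute())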
@@ -116,6 +68,7 @@ class DataUtils:
        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
        """
        # Return early if the DataFrame is empty
+       debug = kwargs.setdefault("debug", False)
        if self.is_dataframe_empty(df):
            return df
 
@@ -136,17 +89,24 @@ class DataUtils:
        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
 
        if source_col not in df.columns:
-           self.logger.
+           self.logger.debug(f"{source_col} not in DataFrame columns")
            return df
 
        # Get unique IDs from source column
        ids = df[source_col].dropna().unique()
-       if
+       # Compute if it's a Dask Series
+       if isinstance(ids, dd.core.Series):
            ids = ids.compute()
+
+       # Check if any IDs are found
        if not len(ids):
-           self.logger.
+           self.logger.debug(f"No IDs found in the source column: {source_col}")
            return df
-
+
+       # Convert to a list only if necessary and sort
+       if not isinstance(ids, list):
+           ids = ids.tolist()
+       ids = sorted(ids)
        # Prepare kwargs for loading lookup data
        load_kwargs = kwargs.copy()
        load_kwargs.update({
@@ -155,10 +115,10 @@ class DataUtils:
            f'{lookup_col}__in': ids
        })
        # Load lookup data
-       lookup_instance = classname(debug=
+       lookup_instance = classname(debug=debug)
        result = lookup_instance.load(**load_kwargs)
        if len(result.index) == 0:
-           self.logger.
+           self.logger.debug(f"No IDs found in the source column: {source_col}")
            return df
        # Determine the join column on the result DataFrame
        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
@@ -167,14 +127,13 @@ class DataUtils:
        df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
        if fillna_source_description_alias and source_description_alias in df.columns:
-           df[source_description_alias]=df[source_description_alias].fillna('')
+           df[source_description_alias] = df[source_description_alias].fillna('')
 
        # Drop temp_join_col if present
        df = df.drop(columns='temp_join_col', errors='ignore')
 
        return df
 
-
    def is_dataframe_empty(self, df):
        """
        Check if a DataFrame (Pandas or Dask) is empty.
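merge_lookup_data now materializes the unique IDs eagerly, converts them to a sorted list, and only then builds the '{lookup_col}__in' filter. The ID-extraction step in isolation looks roughly like this (column and filter names are illustrative):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"customer_id": [3, 1, None, 3, 2]}), npartitions=2)

ids = ddf["customer_id"].dropna().unique()
if isinstance(ids, dd.Series):   # unique() on a Dask column is still lazy
    ids = ids.compute()
if not isinstance(ids, list):
    ids = ids.tolist()
ids = sorted(ids)
# ids would then feed a lookup filter such as {"customer_id__in": ids}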
sibi_dst/utils/_data_wrapper.py CHANGED
@@ -1,8 +1,12 @@
 import datetime
 from typing import Type, Any, Dict, Optional
+
+import dask_expr
 import fsspec
 import pandas as pd
 from IPython.display import display
+from dask.dataframe import dd
+
 from sibi_dst.utils import Logger
 from tqdm import tqdm
 from sibi_dst.utils import ParquetSaver
@@ -112,7 +116,7 @@ class DataWrapper:
         file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(
                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                 f"(threshold: {self.max_age_minutes} minutes)"
             )
@@ -129,14 +133,14 @@ class DataWrapper:
         start_time = datetime.datetime.now()
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(f"Processing {full_parquet_filename}...")
 
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
         if len(df.index)==0:
             if self.verbose:
-                self.logger.
+                self.logger.debug("No data found for the specified date.")
             return
 
         parquet_saver = ParquetSaver(df, folder, self.logger)
@@ -146,7 +150,7 @@ class DataWrapper:
         duration_seconds = (end_time - start_time).total_seconds()
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(
                 f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
             )
 
sibi_dst/utils/_df_utils.py CHANGED
@@ -85,7 +85,7 @@ class DfUtils:
         # Ensure all specified columns exist in the DataFrame
         missing_columns = [col for col, _, _ in conditions if col not in df.columns]
         if missing_columns:
-            self.logger.
+            self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
             return df
 
         # Build the combined filtering condition
@@ -117,7 +117,7 @@ class DfUtils:
            DataFrame: Grouped DataFrame with counts.
        """
        if debug:
-           self.logger.
+           self.logger.debug(f"Grouping by: {group_by_expr}")
 
        df_grouped = df.groupby(by=group_by_expr).size().reset_index(name=group_expr)
        return df_grouped
@@ -141,7 +141,7 @@ class DfUtils:
 
        if debug:
            df_duplicates = df[df.duplicated(subset=duplicate_expr)]
-           self.logger.
+           self.logger.debug(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")
 
        if sort_field:
            if isinstance(df, dd.DataFrame):
@@ -224,9 +224,9 @@ class DfUtils:
        Returns:
            DataFrame: Resampled pivot table.
        """
-       if isinstance(df, dd.DataFrame):
+       if isinstance(df, dd.core.DataFrame):
            # Implement Dask-compatible pivot and resample
-           self.logger.
+           self.logger.debug("Performing summarization with Dask DataFrame.")
            # Ensure the index is a datetime for resampling
            if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
                self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")
sibi_dst/utils/_log_utils.py CHANGED
sibi_dst/utils/_parquet_saver.py CHANGED
@@ -1,18 +1,16 @@
-import datetime
 from pathlib import Path
 from typing import Optional
 
-import
+import dask_expr
 import fsspec
-import pandas as pd
 import pyarrow as pa
+
 from sibi_dst.utils import Logger
 
+
 class ParquetSaver:
     def __init__(self, df_result, parquet_storage_path, logger=None):
         # Ensure df_result is a Dask DataFrame
-        if not isinstance(df_result, dd.DataFrame):
-            df_result = dd.from_pandas(df_result, npartitions=1)
         self.df_result = df_result
         self.parquet_storage_path = parquet_storage_path
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -106,106 +104,3 @@
             str(full_path), engine="pyarrow", schema=schema, write_index=False
         )
 
-# import datetime
-# from pathlib import Path
-# from typing import Optional
-#
-# import dask.dataframe as dd
-# import fsspec
-# import pandas as pd
-# import pyarrow as pa
-# from sibi_dst.utils import Logger
-#
-# class ParquetSaver:
-#     def __init__(self, df_result, parquet_storage_path, logger):
-#         self.df_result = df_result
-#         self.parquet_storage_path = parquet_storage_path
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#
-#
-#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
-#         full_path = self._construct_full_path(parquet_filename)
-#
-#         if len(self.df_result) == 0:
-#             self.logger.warning('No data to save')
-#             return  # Exit early if there's no data to save
-#
-#         # Ensure directory exists and clear if necessary
-#         self._ensure_directory_exists(full_path, clear_existing=True)
-#
-#         # Define schema and save DataFrame to parquet
-#         schema = self._define_schema()
-#         self._convert_dtypes(schema)
-#         self._save_dataframe_to_parquet(full_path, schema)
-#
-#     def _define_schema(self) -> pa.Schema:
-#         """Define a PyArrow schema dynamically based on df_result column types."""
-#         pandas_dtype_to_pa = {
-#             'object': pa.string(),
-#             'string': pa.string(),
-#             'Int64': pa.int64(),
-#             'int64': pa.int64(),
-#             'float64': pa.float64(),
-#             'bool': pa.bool_(),
-#             'boolean': pa.bool_(),  # pandas nullable boolean
-#             'datetime64[ns]': pa.timestamp('ns'),
-#             'timedelta[ns]': pa.duration('ns')
-#         }
-#
-#         fields = [
-#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
-#             for col, dtype in self.df_result.dtypes.items()
-#         ]
-#         return pa.schema(fields)
-#
-#     def _convert_dtypes(self, schema: pa.Schema):
-#         """Convert DataFrame columns to match the specified schema."""
-#         dtype_mapping = {}
-#         for field in schema:
-#             col_name = field.name
-#             if col_name in self.df_result.columns:
-#                 if pa.types.is_string(field.type):
-#                     dtype_mapping[col_name] = 'string'
-#                 elif pa.types.is_int64(field.type):
-#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
-#                 elif pa.types.is_float64(field.type):
-#                     dtype_mapping[col_name] = 'float64'
-#                 elif pa.types.is_boolean(field.type):
-#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
-#                 elif pa.types.is_timestamp(field.type):
-#                     dtype_mapping[col_name] = 'datetime64[ns]'
-#                 else:
-#                     dtype_mapping[col_name] = 'object'  # Fallback to object
-#         self.df_result = self.df_result.astype(dtype_mapping)
-#
-#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
-#         """Construct and return the full path for the parquet file."""
-#         fs, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
-#         parquet_filename = parquet_filename or "default.parquet"
-#         return Path(base_path) / parquet_filename
-#
-#     @staticmethod
-#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
-#         """Ensure that the directory for the path exists, clearing it if specified."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         directory = str(full_path.parent)
-#
-#         if fs.exists(directory):
-#             if clear_existing:
-#                 fs.rm(directory, recursive=True)
-#         else:
-#             fs.mkdirs(directory, exist_ok=True)
-#
-#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
-#         """Save the DataFrame to parquet with fsspec using specified schema."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         if fs.exists(full_path):
-#             fs.rm(full_path, recursive=True)
-#         if isinstance(self.df_result, dd.DataFrame):
-#             self.df_result.to_parquet(
-#                 str(full_path), engine="pyarrow", schema=schema, write_index=False
-#             )
-#         elif isinstance(self.df_result, pd.DataFrame):
-#             dd.from_pandas(self.df_result, npartitions=1).to_parquet(
-#                 str(full_path), engine="pyarrow", schema=schema, write_index=False
-#             )
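With the pandas-to-Dask coercion removed from __init__ and the old commented-out implementation deleted, ParquetSaver now expects to receive a Dask DataFrame from the caller. A hedged usage sketch; the save_to_parquet call mirrors the commented-out copy removed above, so treat the exact method name, arguments, and path as assumptions rather than documented API:

import dask.dataframe as dd
import pandas as pd
from sibi_dst.utils import ParquetSaver

pdf = pd.DataFrame({"id": [1, 2, 3]})
ddf = dd.from_pandas(pdf, npartitions=1)           # convert up front; ParquetSaver no longer does this

saver = ParquetSaver(ddf, "/tmp/sibi_dst_output")  # storage path is illustrative
saver.save_to_parquet("example.parquet")           # assumed API, per the removed commented copy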
{sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.14
+Version: 0.3.16
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,7 @@ Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
 Requires-Dist: django (>=5.1.4,<6.0.0)
 Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)