sibi-dst 2025.9.10__py3-none-any.whl → 2025.9.12__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- sibi_dst/__init__.py +11 -6
- sibi_dst/df_helper/__init__.py +0 -1
- sibi_dst/df_helper/_artifact_updater_async.py +199 -175
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -3
- sibi_dst/osmnx_helper/__init__.py +3 -1
- sibi_dst/utils/__init__.py +2 -1
- sibi_dst/utils/boilerplate/base_pipeline.py +1 -2
- sibi_dst/utils/business_days.py +19 -51
- sibi_dst/utils/dask_utils.py +124 -1
- sibi_dst/utils/data_wrapper.py +0 -11
- sibi_dst/utils/filepath_generator.py +1 -154
- {sibi_dst-2025.9.10.dist-info → sibi_dst-2025.9.12.dist-info}/METADATA +26 -30
- {sibi_dst-2025.9.10.dist-info → sibi_dst-2025.9.12.dist-info}/RECORD +27 -27
- {sibi_dst-2025.9.10.dist-info → sibi_dst-2025.9.12.dist-info}/WHEEL +2 -1
- sibi_dst-2025.9.12.dist-info/top_level.txt +1 -0
- sibi_dst/df_helper/data_cleaner.py +0 -132
sibi_dst-2025.9.12.dist-info/top_level.txt
@@ -0,0 +1 @@
+sibi_dst
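The added top_level.txt records the distribution's importable top-level package, sibi_dst. As a quick sanity check, a minimal sketch (assuming the 2025.9.12 wheel is installed) that reads this metadata back through the standard library:

    from importlib.metadata import distribution

    # top_level.txt lists the top-level import names shipped in the wheel
    dist = distribution("sibi-dst")
    print((dist.read_text("top_level.txt") or "").split())  # expected: ['sibi_dst']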
sibi_dst/df_helper/data_cleaner.py
@@ -1,132 +0,0 @@
-import re
-from nltk.corpus import stopwords
-from nltk.stem import SnowballStemmer
-import dask.dataframe as dd
-from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
-import nltk
-
-class DataCleaner:
-    def __init__(self, dataframe):
-        self.original_df = dataframe
-        self.df = dataframe.copy()
-        self.duplicates_df = None
-
-    def handle_missing_values(self, strategy='mean'):
-        if strategy == 'mean':
-            self.df = self.df.fillna(self.df.mean())
-        elif strategy == 'median':
-            self.df = self.df.fillna(self.df.median())
-        elif strategy == 'mode':
-            self.df = self.df.fillna(self.df.mode().iloc[0])
-        elif strategy == 'drop':
-            self.df = self.df.dropna()
-        return self
-
-    def identify_duplicates(self, subset=None):
-        self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
-        return self.duplicates_df
-
-    def remove_duplicates(self):
-        if self.duplicates_df is not None:
-            self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
-        return self
-
-    def validate_date_fields(self, date_columns=None):
-        if date_columns is None:
-            date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
-        for col in date_columns:
-            print('Validating date field: ', col)
-            self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
-        return self
-
-    def clean_text(self, text_columns=None, language='english'):
-        nltk.download('stopwords')
-        stop_words = set(stopwords.words(language))
-        stemmer = SnowballStemmer(language)
-
-        def clean_text(text):
-            if isinstance(text, str):
-                text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
-                text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
-                words = text.split()
-                words = [word for word in words if word not in stop_words]  # Remove stop words
-                words = [stemmer.stem(word) for word in words]  # Apply stemming
-                return ' '.join(words)
-            return text
-
-        if text_columns is None:
-            text_columns = self.df.select_dtypes(include=['object', 'string']).columns
-            text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']
-
-        for col in text_columns:
-            print('Cleaning text field: ', col)
-            self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
-        return self
-
-    def validate_numeric_fields(self, int_columns=None, float_columns=None):
-        if int_columns is None:
-            int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
-        if float_columns is None:
-            float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns
-
-        for col in int_columns:
-            print('Validating integer field: ', col)
-            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')
-
-        for col in float_columns:
-            print('Validating float field: ', col)
-            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')
-
-        return self
-
-    def detect_categorical_columns(self, threshold=0.05):
-        """
-        Detect columns that can be converted to 'category' dtype.
-
-        Parameters:
-        threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.
-
-        Returns:
-        List of column names that can be converted to 'category' dtype.
-        """
-        categorical_columns = []
-
-        def unique_ratio(partition, col):
-            return partition[col].nunique() / len(partition)
-
-        for col in self.df.columns:
-            print("Detecting categorical columns: ", col)
-            unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
-            overall_unique_ratio = unique_ratios.sum() / len(self.df)
-            if overall_unique_ratio < threshold:
-                print(f'Column {col} is categorical')
-                categorical_columns.append(col)
-
-        return categorical_columns
-
-    def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
-        if columns is None:
-            columns = self.detect_categorical_columns(threshold)
-
-        if method == 'onehot':
-            for col in columns:
-                self.df[col] = self.df[col].astype('category')
-            encoder = OneHotEncoder(sparse_output=False)
-            self.df = encoder.fit_transform(self.df)
-        elif method == 'label':
-            encoder = LabelEncoder()
-            for col in columns:
-                self.df[col] = encoder.fit_transform(self.df[col])
-        return self
-
-    def analyze_dtypes(self):
-        return self.df.dtypes
-
-    def get_cleaned_dataframe(self):
-        return self.df
-
-    def get_original_dataframe(self):
-        return self.original_df
-
-    def get_duplicates_dataframe(self):
-        return self.duplicates_df
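For context on what was removed: DataCleaner wrapped a Dask DataFrame in a fluent API, with most methods mutating self.df and returning self so calls chain. A hypothetical usage sketch against the deleted class (the import follows the removed module path; the data and column names are illustrative only):

    import pandas as pd
    import dask.dataframe as dd

    # Removed in 2025.9.12; this import only resolves against 2025.9.10 or earlier
    from sibi_dst.df_helper.data_cleaner import DataCleaner

    pdf = pd.DataFrame({
        "city": ["Miami", "miami!", None, "Tampa"],
        "signup": ["2024-01-05", "not a date", "2024-02-11", None],
    })
    ddf = dd.from_pandas(pdf, npartitions=2)

    cleaned = (
        DataCleaner(ddf)
        .handle_missing_values(strategy="drop")         # drop rows with nulls
        .validate_date_fields(date_columns=["signup"])  # coerce to datetime; invalid -> NaT
        .clean_text(text_columns=["city"])              # lowercase, strip punctuation, de-stop, stem
        .get_cleaned_dataframe()
    )
    print(cleaned.compute())

Worth noting for anyone reimplementing it: identify_duplicates applied df.duplicated per partition via map_partitions, so the check was partition-local and duplicates spanning partition boundaries went undetected.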