sibi-dst 2025.9.10__py3-none-any.whl → 2025.9.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ sibi_dst
@@ -1,132 +0,0 @@
- import re
- from nltk.corpus import stopwords
- from nltk.stem import SnowballStemmer
- import dask.dataframe as dd
- from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
- import nltk
-
- class DataCleaner:
-     def __init__(self, dataframe):
-         self.original_df = dataframe
-         self.df = dataframe.copy()
-         self.duplicates_df = None
-
-     def handle_missing_values(self, strategy='mean'):
-         if strategy == 'mean':
-             self.df = self.df.fillna(self.df.mean())
-         elif strategy == 'median':
-             self.df = self.df.fillna(self.df.median())
-         elif strategy == 'mode':
-             self.df = self.df.fillna(self.df.mode().iloc[0])
-         elif strategy == 'drop':
-             self.df = self.df.dropna()
-         return self
-
-     def identify_duplicates(self, subset=None):
-         self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
-         return self.duplicates_df
-
-     def remove_duplicates(self):
-         if self.duplicates_df is not None:
-             self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
-         return self
-
-     def validate_date_fields(self, date_columns=None):
-         if date_columns is None:
-             date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
-         for col in date_columns:
-             print('Validating date field: ', col)
-             self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
-         return self
-
-     def clean_text(self, text_columns=None, language='english'):
-         nltk.download('stopwords')
-         stop_words = set(stopwords.words(language))
-         stemmer = SnowballStemmer(language)
-
-         def clean_text(text):
-             if isinstance(text, str):
-                 text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
-                 text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
-                 words = text.split()
-                 words = [word for word in words if word not in stop_words]  # Remove stop words
-                 words = [stemmer.stem(word) for word in words]  # Apply stemming
-                 return ' '.join(words)
-             return text
-
-         if text_columns is None:
-             text_columns = self.df.select_dtypes(include=['object', 'string']).columns
-             text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']
-
-         for col in text_columns:
-             print('Cleaning text field: ', col)
-             self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
-         return self
-
-     def validate_numeric_fields(self, int_columns=None, float_columns=None):
-         if int_columns is None:
-             int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
-         if float_columns is None:
-             float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns
-
-         for col in int_columns:
-             print('Validating integer field: ', col)
-             self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')
-
-         for col in float_columns:
-             print('Validating float field: ', col)
-             self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')
-
-         return self
-
-     def detect_categorical_columns(self, threshold=0.05):
-         """
-         Detect columns that can be converted to 'category' dtype.
-
-         Parameters:
-             threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.
-
-         Returns:
-             List of column names that can be converted to 'category' dtype.
-         """
-         categorical_columns = []
-
-         def unique_ratio(partition, col):
-             return partition[col].nunique() / len(partition)
-
-         for col in self.df.columns:
-             print("Detecting categorical columns: ", col)
-             unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
-             overall_unique_ratio = unique_ratios.sum() / len(self.df)
-             if overall_unique_ratio < threshold:
-                 print(f'Column {col} is categorical')
-                 categorical_columns.append(col)
-
-         return categorical_columns
-
-     def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
-         if columns is None:
-             columns = self.detect_categorical_columns(threshold)
-
-         if method == 'onehot':
-             for col in columns:
-                 self.df[col] = self.df[col].astype('category')
-             encoder = OneHotEncoder(sparse_output=False)
-             self.df = encoder.fit_transform(self.df)
-         elif method == 'label':
-             encoder = LabelEncoder()
-             for col in columns:
-                 self.df[col] = encoder.fit_transform(self.df[col])
-         return self
-
-     def analyze_dtypes(self):
-         return self.df.dtypes
-
-     def get_cleaned_dataframe(self):
-         return self.df
-
-     def get_original_dataframe(self):
-         return self.original_df
-
-     def get_duplicates_dataframe(self):
-         return self.duplicates_df
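For reference, the removed DataCleaner exposed a fluent, chainable API over a Dask DataFrame. Below is a minimal usage sketch under stated assumptions: the class definition has been copied into a local module (this diff does not show the original file path, so the import is a placeholder), the NLTK stopwords corpus can be downloaded on first use, and the sample data and column names are purely illustrative.

import pandas as pd
import dask.dataframe as dd

from data_cleaner import DataCleaner  # hypothetical local module; the diff omits the file path

# Illustrative sample data (hypothetical column names).
pdf = pd.DataFrame({
    'name': ['Alice ', 'Bob!!', 'Alice ', None],
    'age': [30.0, 41.0, 30.0, None],
    'signup': pd.to_datetime(['2024-01-05', '2024-02-11', '2024-01-05', '2024-03-02']),
})
ddf = dd.from_pandas(pdf, npartitions=2)

cleaner = (
    DataCleaner(ddf)
    .handle_missing_values(strategy='drop')         # drop rows containing any NaN
    .validate_date_fields(date_columns=['signup'])  # coerce unparseable dates to NaT
    .clean_text(text_columns=['name'])              # lowercase, strip punctuation, de-stopword, stem
)
duplicates = cleaner.identify_duplicates(subset=['name', 'age'])
cleaned = cleaner.remove_duplicates().get_cleaned_dataframe().compute()

Most methods return self, which is what enables the chained style; identify_duplicates is the exception, returning the duplicates frame directly rather than self, so it sits outside the chain.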