junshan-kit 2.3.1-py2.py3-none-any.whl → 2.3.2-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
junshan_kit/DataProcessor.py
CHANGED
@@ -6,7 +6,7 @@
 """

 import pandas as pd
-
+from sklearn.preprocessing import StandardScaler

 class CSV_TO_Pandas:
     def __init__(self):
@@ -19,7 +19,9 @@ class CSV_TO_Pandas:
         label_col: str,
         label_map: dict,
         data_name: str,
+        user_one_hot_cols = [],
         print_info=False,
+        Standard = False
     ):
         """
         Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
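For context, a minimal usage sketch of the new signature. This is a hedged illustration, not package documentation: the CSV path and column names below are invented placeholders, and the positional argument order is inferred from the `_run` call site in `DataSets.py`.

    # Sketch: 'toy.csv', 'row_id', 'target', 'month' are hypothetical.
    import junshan_kit.DataProcessor

    cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
    df = cleaner.preprocess_dataset(
        "toy.csv",                    # csv_path
        ["row_id"],                   # drop_cols
        "target",                     # label_col
        {0: -1, 1: 1},                # label_map
        "Toy Data",                   # data_name
        user_one_hot_cols=["month"],  # numeric columns to force through one-hot encoding
        print_info=True,
        Standard=True,                # standardize the remaining numeric columns
    )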
@@ -88,9 +90,15 @@
             col for col in text_feature_cols if col != label_col
         ]  # ✅ exclude label

-        df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
+        df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
         m_cleaned, n_cleaned = df.shape

+        if Standard:
+            # Identify numerical columns and standardize them
+            num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
+            scaler = StandardScaler()
+            df[num_cols] = scaler.fit_transform(df[num_cols])
+
         # print info
         if print_info:
             pos_count = (df[label_col] == 1).sum()
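The new `Standard` branch follows the usual scikit-learn pattern. As a quick self-contained illustration of what `StandardScaler` does to the selected numeric columns (toy data, not from the package):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    toy = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
    scaler = StandardScaler()
    # Each column becomes (x - column_mean) / column_std, so every
    # scaled column has mean 0 and unit variance.
    toy[["a", "b"]] = scaler.fit_transform(toy[["a", "b"]])
    print(toy.mean().round(6).tolist())  # [0.0, 0.0]

Note that by this point `pd.get_dummies` has already replaced the original `text_feature_cols` in `df.columns`, so the exclusion list mainly guards `label_col` and the one-hot source columns.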
@@ -98,20 +106,21 @@

         # Step 6: Print dataset information
         print("\n" + "=" * 80)
-        print(f"{'{data_name} - Info':^70}")
+        print(f"{f'{data_name} - Info':^70}")
         print("=" * 80)
         print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
         print(
-            f"{'Size after dropping NaN & non-feature cols:':<40} {
+            f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
         )
         print(f"{'Positive samples (+1):':<40} {pos_count}")
         print(f"{'Negative samples (-1):':<40} {neg_count}")
         print(
-            f"{'Size after one-hot encoding:':<40} {
+            f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
         )
         print("-" * 80)
         print(f"Note:")
-        print(f"{'Label column:':<40} {label_col}")
+        print(f"{'Label column:':<40} {label_col}")
+        print(f"{'label_map:':<40} {label_map}")
         print(
             f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
         )
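The header fix is worth spelling out: in the old line the braces sat inside a plain string literal, so the literal text `{data_name} - Info` was centered rather than the dataset's name. The nested f-string evaluates the inner expression first:

    data_name = "Demo"
    print(f"{'{data_name} - Info':^70}")   # centers the literal '{data_name} - Info'
    print(f"{f'{data_name} - Info':^70}")  # centers 'Demo - Info'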
junshan_kit/DataSets.py
CHANGED
@@ -36,16 +36,26 @@ def _download_data(data_name):
         else:
             print("❌ Invalid choice. Please enter 1 or 2.\n")

-def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info):
+def _export_csv(df, data_name):
+    path = f'./data_trans_fea/{data_name}/'
+    os.makedirs(path, exist_ok=True)
+    df.to_csv(path + f'{data_name}.csv')
+    print(path + f'{data_name}.csv')
+
+
+def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, ):
     if not os.path.exists(csv_path):
         print('\n' + '*'*60)
         print(f"Please download the data.")
         print(csv_path)
         _download_data(data_name)
-        junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
+        junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+
     cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
-    df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, print_info=print_info)
+    df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, user_one_hot_cols, print_info=print_info)
+
+    if export_csv:
+        _export_csv(df, data_name)

     return df
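A hedged usage sketch of the new export path: `_run` is module-internal, but the `TamilNadu_weather_2020_2025` loader added below exposes the flag directly, and `_export_csv` writes to `./data_trans_fea/<data_name>/<data_name>.csv`.

    # Sketch, assuming the package is installed and the raw CSV is present.
    import junshan_kit.DataSets as DataSets

    df = DataSets.TamilNadu_weather_2020_2025(print_info=True, export_csv=True)
    # -> writes ./data_trans_fea/TN Weather 2020-2025/TN Weather 2020-2025.csv

One design note: `df.to_csv(path + f'{data_name}.csv')` keeps the pandas index as an extra column; passing `index=False` would drop it.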
@@ -94,7 +104,7 @@ def electric_vehicle_population_data(data_name = "Electric Vehicle Population",
 def global_house_purchase_dataset(data_name = "Global House Purchase", print_info = False):
     csv_path = f'./exp_data/{data_name}/global_house_purchase_dataset.csv'
     drop_cols = ['property_id']
-    label_col = '
+    label_col = 'decision'
     label_map = {0: -1, 1: 1}

     df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
@@ -105,7 +115,7 @@ def global_house_purchase_dataset(data_name = "Global House Purchase", print_inf
 def health_lifestyle_dataset(data_name = "Health Lifestyle", print_info = False):
     csv_path = f'./exp_data/{data_name}/health_lifestyle_dataset.csv'
     drop_cols = ['id']
-    label_col = '
+    label_col = 'disease_risk'
     label_map = {0: -1, 1: 1}

     df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
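Both fixes fill in the label column that feeds the `{0: -1, 1: 1}` mapping. One property of `Series.map` worth keeping in mind here: values missing from the dictionary become NaN rather than raising (toy example, not package code):

    import pandas as pd

    labels = pd.Series([0, 1, 1, 0, 2])
    mapped = labels.map({0: -1, 1: 1})
    print(mapped.tolist())  # [-1.0, 1.0, 1.0, -1.0, nan] — the unmapped 2 becomes NaN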
@@ -114,8 +124,123 @@ def health_lifestyle_dataset(data_name = "Health Lifestyle", print_info = False)



+def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False):
+    """
+    1. Missing values in this dataset are handled by dropping the affected column: the `alcohol_freq` column contains a large number of missing values, so deleting rows would cause significant data loss, and the entire column is dropped instead.
+
+    2. Several columns could serve as binary classification labels, such as `is_high_risk`, `cardiovascular_disease`, and `liver_disease`. Here, `is_high_risk` is chosen as the label column.
+    """
+    csv_path = f'./exp_data/{data_name}/medical_insurance.csv'
+    drop_cols = ['alcohol_freq']
+    label_col = 'is_high_risk'
+    label_map = {0: -1, 1: 1}
+
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+
+    return df
+

+def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False):
+    csv_path = f'./exp_data/{data_name}/Particle Physics Event Classification.csv'
+    drop_cols = []
+    label_col = 'Label'
+    label_map = {'s': -1, 'b': 1}

-
-    pass
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)

+    return df
+
+
+
+def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False):
+    csv_path = f'./exp_data/{data_name}/adult.csv'
+    drop_cols = []
+    label_col = 'income'
+    label_map = {'<=50K': -1, '>50K': 1}
+
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+
+    return df
+
+
+
+
+def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
+    csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
+    label_col = 'rain_tomorrow'
+    label_map = {0: -1, 1: 1}
+
+    # Step 0: Load the dataset
+    df = pd.read_csv(csv_path)
+
+    df['time'] = pd.to_datetime(df['time'])
+    df['year'] = df['time'].dt.year
+    df['month'] = df['time'].dt.month
+    df['day'] = df['time'].dt.day
+    df['hour'] = df['time'].dt.hour
+
+    user_one_hot_cols = ['year', 'month', 'day', 'hour']
+    drop_cols = ['Unnamed: 0', 'time']
+
+    # Save original size
+    m_original, n_original = df.shape
+
+    # Step 1: Drop non-informative columns
+    df = df.drop(columns=drop_cols)
+
+    # Step 2: Remove rows with missing values
+    df = df.dropna(axis=0, how="any")
+    m_encoded, n_encoded = df.shape
+
+    # Step 3: Map target label (to -1 and +1)
+    df[label_col] = df[label_col].map(label_map)
+
+    # Step 4: Encode categorical features (exclude label column)
+    text_feature_cols = df.select_dtypes(
+        include=["object", "string", "category"]
+    ).columns
+    text_feature_cols = [
+        col for col in text_feature_cols if col != label_col
+    ]  # ✅ exclude label
+
+    df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
+    m_cleaned, n_cleaned = df.shape
+
+    num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
+    scaler = StandardScaler()
+    df[num_cols] = scaler.fit_transform(df[num_cols])
+
+    if export_csv:
+        _export_csv(df, data_name)
+
+    # print info
+    if print_info:
+        pos_count = (df[label_col] == 1).sum()
+        neg_count = (df[label_col] == -1).sum()
+
+        # Step 6: Print dataset information
+        print("\n" + "=" * 80)
+        print(f"{f'{data_name} - Info':^70}")
+        print("=" * 80)
+        print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
+        print(
+            f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+        )
+        print(f"{'Positive samples (+1):':<40} {pos_count}")
+        print(f"{'Negative samples (-1):':<40} {neg_count}")
+        print(
+            f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
+        )
+        print("-" * 80)
+        print(f"Note:")
+        print(f"{'Label column:':<40} {label_col}")
+        print(f"{'label_map:':<40} {label_map}")
+        print(
+            f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+        )
+        print(
+            f"{'text feature columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
+        )
+        print("=" * 80 + "\n")
+
+    return df
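One detail in this new loader deserves a note: `pd.get_dummies` only auto-encodes object/category columns, so the numeric `year`/`month`/`day`/`hour` features extracted from the timestamp must be passed explicitly via `columns=`, which is exactly what `user_one_hot_cols` feeds. A toy illustration:

    import pandas as pd

    toy = pd.DataFrame({"time": pd.to_datetime(["2024-01-02 03:00", "2024-02-02 04:00"])})
    toy["month"] = toy["time"].dt.month  # numeric, so get_dummies skips it by default
    auto = pd.get_dummies(toy.drop(columns="time"), dtype=int)
    forced = pd.get_dummies(toy.drop(columns="time"), columns=["month"], dtype=int)
    print(auto.columns.tolist())    # ['month'] — left as a single numeric column
    print(forced.columns.tolist())  # ['month_1', 'month_2']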
junshan_kit-2.3.2.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+junshan_kit/DataProcessor.py,sha256=YIZMy2gnqnT8n9MMT-q7WtRB1bbA4ITwPxNBEasAnLQ,4966
+junshan_kit/DataSets.py,sha256=qN4lTVaUsKlu4b8tkZ3aMgHg9lyZTQJlYsgc0uLwUys,8570
+junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
+junshan_kit-2.3.2.dist-info/METADATA,sha256=OcuB7peGzVOuC0KIcohSPiCBdAs9BZ1mpZjX4ySSKpk,329
+junshan_kit-2.3.2.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+junshan_kit-2.3.2.dist-info/RECORD,,

junshan_kit-2.3.1.dist-info/RECORD
REMOVED
@@ -1,7 +0,0 @@
-junshan_kit/DataProcessor.py,sha256=QnYsqt2j4amZ4U04Urcu91RJs0du-tkl0N2lwxnTy1U,4472
-junshan_kit/DataSets.py,sha256=2P2AMQjQDKx8FITa5cNBaiFuUhPuWXKkgdlBM02LfPQ,3954
-junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
-junshan_kit-2.3.1.dist-info/METADATA,sha256=cCVc5fHfyWVno2RFPc8xskJ7XO8e7sWHYmV-8udM5s4,329
-junshan_kit-2.3.1.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
-junshan_kit-2.3.1.dist-info/RECORD,,
File without changes