junshan-kit 2.3.0-py2.py3-none-any.whl → 2.3.2-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
junshan_kit/DataProcessor.py CHANGED
@@ -6,7 +6,7 @@
 """
 
 import pandas as pd
-
+from sklearn.preprocessing import StandardScaler
 
 class CSV_TO_Pandas:
     def __init__(self):
@@ -18,7 +18,10 @@ class CSV_TO_Pandas:
         drop_cols: list,
         label_col: str,
         label_map: dict,
+        data_name: str,
+        user_one_hot_cols=[],
         print_info=False,
+        Standard=False
     ):
         """
         Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
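
The hunk above changes the public signature of preprocess_dataset. A minimal sketch of a call against the 2.3.2 signature, assuming the positional order shown in the diff; the CSV path and column names here are hypothetical placeholders, not values from the package:

    import junshan_kit.DataProcessor

    cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
    df = cleaner.preprocess_dataset(
        "./exp_data/Toy Data/toy.csv",  # csv_path (placeholder)
        ["id"],                         # drop_cols (placeholder)
        "target",                       # label_col (placeholder)
        {0: -1, 1: 1},                  # label_map
        "Toy Data",                     # data_name (new in 2.3.2)
        ["month"],                      # user_one_hot_cols (new: extra columns to one-hot encode)
        print_info=True,
        Standard=True,                  # new flag: standardize numeric columns
    )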
@@ -87,9 +90,15 @@ class CSV_TO_Pandas:
             col for col in text_feature_cols if col != label_col
         ]  # ✅ exclude label
 
-        df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
+        df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
         m_cleaned, n_cleaned = df.shape
 
+        if Standard:
+            # Identify the numerical columns and standardize them
+            num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
+            scaler = StandardScaler()
+            df[num_cols] = scaler.fit_transform(df[num_cols])
+
         # print info
         if print_info:
             pos_count = (df[label_col] == 1).sum()
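
For context, StandardScaler rescales each selected column to zero mean and unit variance, z = (x - mean) / std. A self-contained sketch of what the new Standard branch computes, on made-up data:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"age": [20, 30, 40], "label": [1, -1, 1]})
    num_cols = [c for c in df.columns if c != "label"]  # label kept out, as in the hunk
    df[num_cols] = StandardScaler().fit_transform(df[num_cols])
    print(df["age"].mean(), df["age"].std(ddof=0))  # ~0.0 and 1.0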
@@ -97,20 +106,21 @@
 
             # Step 6: Print dataset information
             print("\n" + "=" * 80)
-            print(f"{'Dataset Info':^70}")
+            print(f"{f'{data_name} - Info':^70}")
             print("=" * 80)
             print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
             print(
-                f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols"
+                f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
             )
             print(f"{'Positive samples (+1):':<40} {pos_count}")
             print(f"{'Negative samples (-1):':<40} {neg_count}")
             print(
-                f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols"
+                f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
             )
             print("-" * 80)
             print(f"Note:")
-            print(f"{'Label column:':<40} {label_col}")
+            print(f"{'Label column:':<40} {label_col}")
+            print(f"{'label_map:':<40} {label_map}")
             print(
                 f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
             )
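
The report above leans on f-string alignment: :<40 left-aligns a caption in a 40-character field, :^70 centres the banner, and the new title nests one f-string inside another. A quick illustration with a placeholder dataset name:

    data_name = "Some Dataset"              # placeholder
    print(f"{f'{data_name} - Info':^70}")   # centred banner, as printed in 2.3.2
    print(f"{'Label column:':<40} income")  # left-aligned caption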
junshan_kit/DataSets.py CHANGED
@@ -36,16 +36,26 @@ def _download_data(data_name):
         else:
             print("❌ Invalid choice. Please enter 1 or 2.\n")
 
-def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info):
+def _export_csv(df, data_name):
+    path = f'./data_trans_fea/{data_name}/'
+    os.makedirs(path, exist_ok=True)
+    df.to_csv(path + f'{data_name}.csv')
+    print(path + f'{data_name}.csv')
+
+
+def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_one_hot_cols=[], export_csv=False):
     if not os.path.exists(csv_path):
         print('\n' + '*'*60)
         print(f"Please download the data.")
         print(csv_path)
         _download_data(data_name)
-        junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
+        junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+
     cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
-    df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, print_info=print_info)
+    df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, user_one_hot_cols, print_info=print_info)
+
+    if export_csv:
+        _export_csv(df, data_name)
 
     return df
 
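
Note that _export_csv calls to_csv without index=False, so the exported file carries an extra index column. A small sketch of reading such a file back, assuming the ./data_trans_fea/<data_name>/<data_name>.csv layout created above (the path is illustrative):

    import pandas as pd

    # index_col=0 re-absorbs the unnamed index column written by _export_csv
    df = pd.read_csv("./data_trans_fea/Toy Data/Toy Data.csv", index_col=0)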
@@ -67,7 +77,7 @@ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print
     return df
 
 
-def diabetes_health_indicators_dataset(data_name = "Diabetes Health Indicators Dataset", print_info = False):
+def diabetes_health_indicators_dataset(data_name = "Diabetes Health Indicators", print_info = False):
     csv_path = f'./exp_data/{data_name}/diabetes_dataset.csv'
     drop_cols = []
     label_col = 'diagnosed_diabetes'
@@ -78,7 +88,7 @@ def diabetes_health_indicators_dataset(data_name = "Diabetes Health Indicators D
     return df
 
 
-def electric_vehicle_population_data(data_name = "Electric Vehicle Population Data", print_info = False):
+def electric_vehicle_population_data(data_name = "Electric Vehicle Population", print_info = False):
     csv_path = f'./exp_data/{data_name}/Electric_Vehicle_Population_Data.csv'
     drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
     label_col = 'Electric Vehicle Type'
@@ -91,10 +101,10 @@ def electric_vehicle_population_data(data_name = "Electric Vehicle Population Da
 
     return df
 
-def global_house_purchase_dataset(data_name = "Global House Purchase Dataset", print_info = False):
+def global_house_purchase_dataset(data_name = "Global House Purchase", print_info = False):
     csv_path = f'./exp_data/{data_name}/global_house_purchase_dataset.csv'
     drop_cols = ['property_id']
-    label_col = 'Electric Vehicle Type'
+    label_col = 'decision'
     label_map = {0: -1, 1: 1}
 
     df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
@@ -102,10 +112,10 @@ def global_house_purchase_dataset(data_name = "Global House Purchase", print_inf
     return df
 
 
-def health_lifestyle_dataset(data_name = "Health_lifestyle_dataset", print_info = False):
+def health_lifestyle_dataset(data_name = "Health Lifestyle", print_info = False):
     csv_path = f'./exp_data/{data_name}/health_lifestyle_dataset.csv'
     drop_cols = ['id']
-    label_col = 'decision'
+    label_col = 'disease_risk'
     label_map = {0: -1, 1: 1}
 
     df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
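
All of these loaders push their raw labels through Series.map into the ±1 convention; values absent from label_map become NaN rather than raising, which matters when a label column is renamed as in the two hunks above. A toy illustration:

    import pandas as pd

    labels = pd.Series([0, 1, 1, 0])
    print(labels.map({0: -1, 1: 1}).tolist())  # [-1, 1, 1, -1]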
@@ -114,8 +124,123 @@ def health_lifestyle_dataset(data_name = "Health_lifestyle_dataset", print_info
 
 
 
+def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False):
+    """
+    1. Missing values in this dataset are handled by dropping the affected column outright: the `alcohol_freq` column contains so many missing values that deleting rows would lose a significant share of the data, so the whole column is dropped instead.
+
+    2. Several columns could serve as a binary classification label, such as `is_high_risk`, `cardiovascular_disease`, and `liver_disease`. Here `is_high_risk` is chosen as the label column.
+    """
+    csv_path = f'./exp_data/{data_name}/medical_insurance.csv'
+    drop_cols = ['alcohol_freq']
+    label_col = 'is_high_risk'
+    label_map = {0: -1, 1: 1}
+
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+
+    return df
+
 
+def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False):
+    csv_path = f'./exp_data/{data_name}/Particle Physics Event Classification.csv'
+    drop_cols = []
+    label_col = 'Label'
+    label_map = {'s': -1, 'b': 1}
 
-def wine_and_food_pairing_dataset():
-    pass
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
 
+    return df
+
+
+
+def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False):
+    csv_path = f'./exp_data/{data_name}/adult.csv'
+    drop_cols = []
+    label_col = 'income'
+    label_map = {'<=50K': -1, '>50K': 1}
+
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+
+    return df
+
+
+
+
+def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
+    csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
+    label_col = 'rain_tomorrow'
+    label_map = {0: -1, 1: 1}
+
+    # Step 0: Load the dataset
+    df = pd.read_csv(csv_path)
+
+    df['time'] = pd.to_datetime(df['time'])
+    df['year'] = df['time'].dt.year
+    df['month'] = df['time'].dt.month
+    df['day'] = df['time'].dt.day
+    df['hour'] = df['time'].dt.hour
+
+    user_one_hot_cols = ['year', 'month', 'day', 'hour']
+    drop_cols = ['Unnamed: 0', 'time']
+
+    # Save original size
+    m_original, n_original = df.shape
+
+    # Step 1: Drop non-informative columns
+    df = df.drop(columns=drop_cols)
+
+    # Step 2: Remove rows with missing values
+    df = df.dropna(axis=0, how="any")
+    m_encoded, n_encoded = df.shape
+
+    # Step 3: Map target label (to -1 and +1)
+    df[label_col] = df[label_col].map(label_map)
+
+    # Step 4: Encode categorical features (exclude label column)
+    text_feature_cols = df.select_dtypes(
+        include=["object", "string", "category"]
+    ).columns
+    text_feature_cols = [
+        col for col in text_feature_cols if col != label_col
+    ]  # ✅ exclude label
+
+    df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
+    m_cleaned, n_cleaned = df.shape
+
+    # Step 5: Standardize numerical columns
+    num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
+    scaler = StandardScaler()
+    df[num_cols] = scaler.fit_transform(df[num_cols])
+
+    if export_csv:
+        _export_csv(df, data_name)
+
+    # print info
+    if print_info:
+        pos_count = (df[label_col] == 1).sum()
+        neg_count = (df[label_col] == -1).sum()
+
+        # Step 6: Print dataset information
+        print("\n" + "=" * 80)
+        print(f"{f'{data_name} - Info':^70}")
+        print("=" * 80)
+        print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
+        print(
+            f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+        )
+        print(f"{'Positive samples (+1):':<40} {pos_count}")
+        print(f"{'Negative samples (-1):':<40} {neg_count}")
+        print(
+            f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
+        )
+        print("-" * 80)
+        print(f"Note:")
+        print(f"{'Label column:':<40} {label_col}")
+        print(f"{'label_map:':<40} {label_map}")
+        print(
+            f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+        )
+        print(
+            f"{'Text feature columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
+        )
+        print("=" * 80 + "\n")
+
+    return df
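
A hedged usage sketch of the loaders added in this hunk, assuming they are exposed at module level in junshan_kit.DataSets as the diff suggests, and that the CSVs already sit under ./exp_data/ (otherwise the loaders prompt for a download):

    from junshan_kit import DataSets

    # Each loader returns a cleaned pandas DataFrame whose label column is in {-1, +1}.
    df_income = DataSets.adult_income_prediction(print_info=True)
    df_weather = DataSets.TamilNadu_weather_2020_2025(print_info=True, export_csv=False)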
junshan_kit-2.3.0.dist-info/METADATA → junshan_kit-2.3.2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: junshan_kit
-Version: 2.3.0
+Version: 2.3.2
 Summary: This is an optimization tool.
 Author-email: Junshan Yin <junshanyin@163.com>
 Requires-Dist: kaggle==1.7.4.5
junshan_kit-2.3.2.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+junshan_kit/DataProcessor.py,sha256=YIZMy2gnqnT8n9MMT-q7WtRB1bbA4ITwPxNBEasAnLQ,4966
+junshan_kit/DataSets.py,sha256=qN4lTVaUsKlu4b8tkZ3aMgHg9lyZTQJlYsgc0uLwUys,8570
+junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
+junshan_kit-2.3.2.dist-info/METADATA,sha256=OcuB7peGzVOuC0KIcohSPiCBdAs9BZ1mpZjX4ySSKpk,329
+junshan_kit-2.3.2.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+junshan_kit-2.3.2.dist-info/RECORD,,
junshan_kit-2.3.0.dist-info/RECORD REMOVED
@@ -1,7 +0,0 @@
-junshan_kit/DataProcessor.py,sha256=eryVmS5BFZj8wjDN2QWVHqkbFgFuWU0HXV9s6TGf9QM,4442
-junshan_kit/DataSets.py,sha256=rf5AVlA9DxP7wBpXjSO1_xznCMuxEoK50TqExafwHhc,3972
-junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
-junshan_kit-2.3.0.dist-info/METADATA,sha256=9NlU4YOD0zx5F5tLIMREKKxn-LwYD8-7IFVtp7DvMNM,329
-junshan_kit-2.3.0.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
-junshan_kit-2.3.0.dist-info/RECORD,,