junshan-kit 2.4.2__py2.py3-none-any.whl → 2.4.4__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -146,6 +146,7 @@ class CSV_TO_Pandas:
  """
  # Step 0: Load the dataset
  df = pd.read_csv(csv_path)
+ columns = df.columns

  # Save original size
  m_original, n_original = df.shape
@@ -214,13 +215,17 @@ class CSV_TO_Pandas:
  print(f"{'Label column:':<40} {label_col}")
  print(f"{'label_map:':<40} {label_map}")
  print(f"{'time column:':<40} {time_info}")
- if time_info:
+ if time_info is not None:
  print(f"{'trans_type : int, optional, default=1'}")
  print(f"{' - 0 : Extract [\'year\', \'month\', \'day\', \'hour\']':<10}")
  print(f"{' - 1 : Extract [\'hour\', \'dayofweek\', \'is_weekend\']':<10}")
  print(
  f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
  )
+ print("-" * 80)
+ print(
+ f"{'all columns:':<40} {', '.join(columns)}"
+ )
  print("=" * 80 + "\n")

  return df
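
The switch from "if time_info:" to "if time_info is not None:" matters when a falsy but meaningful value (an empty string, 0, or an empty list) is passed: the old test skipped the trans_type hint for such values, while the new test only skips it when no value was supplied at all. A minimal sketch of the difference, using hypothetical values rather than anything from the package:

    time_info = ""               # falsy, but the caller did pass something
    if time_info:                # old check: hint block is skipped
        print("trans_type hint")
    if time_info is not None:    # new check: hint block is printed
        print("trans_type hint")

The columns = df.columns snapshot added in the previous hunk is taken right after pd.read_csv, so the new "all columns" line at the end of this summary block appears to list the raw CSV header before any columns are dropped.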
junshan_kit/DataSets.py CHANGED
@@ -40,10 +40,10 @@ def _download_data(data_name, data_type):
  print("❌ Invalid choice. Please enter 1 or 2.\n")

  # unzip file
- junshan_kit.kit.unzip_file(f'./exp_data/{data_type}/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+ junshan_kit.kit.unzip_file(f'./exp_data/{data_type}/{data_name}/{data_name}.zip', f'./exp_data/{data_type}/{data_name}')

- def _export_csv(df, data_name):
- path = f'./exp_data/{data_name}/'
+ def _export_csv(df, data_name, data_type):
+ path = f'./exp_data/{data_type}/{data_name}/'
  os.makedirs(path, exist_ok=True)
  df.to_csv(path + f'{data_name}_num.csv')
  print(path + f'{data_name}.csv')
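
Both changes in this hunk make the on-disk layout consistent: the downloaded archive is now extracted into the same data_type subfolder it was saved to, and _export_csv writes its CSV under that folder as well once _run forwards data_type to it (next hunk). A minimal sketch of the resulting paths, using a dataset name that appears later in this diff:

    data_type = "binary"
    data_name = "Adult Income Prediction"
    zip_path    = f'./exp_data/{data_type}/{data_name}/{data_name}.zip'
    extract_dir = f'./exp_data/{data_type}/{data_name}'     # 2.4.2 extracted to ./exp_data/{data_name}
    export_dir  = f'./exp_data/{data_type}/{data_name}/'    # _export_csv now writes under the same root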
@@ -61,7 +61,7 @@ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_
  df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, user_one_hot_cols, print_info=print_info, time_info = time_info)

  if export_csv:
- _export_csv(df, data_name)
+ _export_csv(df, data_name, data_type)

  return df

@@ -73,11 +73,10 @@ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_
  ----------------------------------------------------------------------
  """

- def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False):
+ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False, drop_cols = []):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/creditcard.csv'
- drop_cols = []
  label_col = 'Class'
  label_map = {0: -1, 1: 1}

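Across the loaders in this release the hard-coded drop_cols list moves into the signature as a default argument, so callers can choose which columns to discard at load time. A hedged usage sketch, assuming the loaders are reachable from the junshan_kit.DataSets module listed in this diff (the enclosing class, if any, is not visible here, and 'Time' is only an illustrative column name):

    from junshan_kit import DataSets

    # default behaviour, equivalent to 2.4.2 (drop_cols=[])
    df = DataSets.credit_card_fraud_detection(print_info=True)

    # new in 2.4.4: drop extra columns when loading
    df = DataSets.credit_card_fraud_detection(drop_cols=['Time'], print_info=True)
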
@@ -88,24 +87,22 @@ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print
  return df


- def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False):
+ def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False, drop_cols = []):
  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/diabetes_dataset.csv'
- drop_cols = []
  label_col = 'diagnosed_diabetes'
  label_map = {0: -1, 1: 1}

-
  df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

  return df


- def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False):
+ def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False, drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/Electric_Vehicle_Population_Data.csv'
- drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
+ # drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
  label_col = 'Electric Vehicle Type'
  label_map = {
  'Battery Electric Vehicle (BEV)': 1,
@@ -117,11 +114,10 @@ def electric_vehicle_population(data_name = "Electric Vehicle Population", print

  return df

- def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False):
+ def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False, drop_cols = ['property_id']):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/global_house_purchase_dataset.csv'
- drop_cols = ['property_id']
  label_col = 'decision'
  label_map = {0: -1, 1: 1}

@@ -131,11 +127,11 @@ def global_house_purchase(data_name = "Global House Purchase", print_info = Fals
  return df


- def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False):
+ def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False, drop_cols = ['id']):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/health_lifestyle_dataset.csv'
- drop_cols = ['id']
+
  label_col = 'disease_risk'
  label_map = {0: -1, 1: 1}

@@ -145,7 +141,7 @@ def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_
  return df


- def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False, export_csv = False):
+ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False, export_csv = False, drop_cols = ['alcohol_freq']):
  """
  1. The missing values in this dataset are handled by directly removing the corresponding column. Since the `alcohol_freq` column contains a large number of missing values, deleting the rows would result in significant data loss, so the entire column is dropped instead.

@@ -154,7 +150,7 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/medical_insurance.csv'
- drop_cols = ['alcohol_freq']
+
  label_col = 'is_high_risk'
  label_map = {0: -1, 1: 1}

@@ -164,11 +160,11 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic
  return df


- def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False):
+ def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False, drop_cols = []):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/Particle Physics Event Classification.csv'
- drop_cols = []
+
  label_col = 'Label'
  label_map = {'s': -1, 'b': 1}

@@ -179,11 +175,11 @@ def particle_physics_event_classification(data_name = "Particle Physics Event Cl



- def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False):
+ def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False, drop_cols = []):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/adult.csv'
- drop_cols = []
+
  label_col = 'income'
  label_map = {'<=50K': -1, '>50K': 1}

@@ -193,11 +189,11 @@ def adult_income_prediction(data_name = "Adult Income Prediction", print_info =
  return df


- def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
+ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False, drop_cols = ['Unnamed: 0']):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/TNweather_1.8M.csv'
- drop_cols = ['Unnamed: 0']
+
  label_col = 'rain_tomorrow'
  label_map = {0: -1, 1: 1}

@@ -214,11 +210,11 @@ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info =

  return df

- def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
+ def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False, drop_cols = ['user_id']):

  data_type = "binary"
  csv_path = f'./exp_data/{data_type}/{data_name}/youtube recommendation dataset.csv'
- drop_cols = ['user_id']
+
  label_col = 'subscribed_after'
  label_map = {0: -1, 1: 1}

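Every loader in this file defines a label_map that sends the raw label values to {-1, +1}, which preprocess_dataset presumably applies before the DataFrame is returned. A minimal pandas sketch of that relabelling, independent of the library and using a couple of hypothetical rows:

    import pandas as pd

    df = pd.DataFrame({"income": ["<=50K", ">50K", "<=50K"]})   # hypothetical rows
    label_map = {"<=50K": -1, ">50K": 1}                        # as in adult_income_prediction
    df["income"] = df["income"].map(label_map)
    print(df["income"].tolist())                                # [-1, 1, -1]
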
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: junshan_kit
- Version: 2.4.2
+ Version: 2.4.4
  Summary: This is an optimization tool.
  Author-email: Junshan Yin <junshanyin@163.com>
  Requires-Dist: kaggle==1.7.4.5
@@ -1,6 +1,6 @@
  junshan_kit/ComOptimizers.py,sha256=MUgFnm1DbbvNKv5-7nHJCLOfq4VjoNk1KLRR-ji5rOA,4637
- junshan_kit/DataProcessor.py,sha256=rp1zOTOoF98idwGM_QRzr8yC9M5fj1uyfVhdmV02kyQ,8962
- junshan_kit/DataSets.py,sha256=rRaCPtlR5WvH0E1CAaaWbVkfS5QU12ak31VbREq_prE,8354
+ junshan_kit/DataProcessor.py,sha256=KBvhW7nIbOkLz3Mr6-gl3oVJ2Ua8QHuV_LJiQWkm_lE,9125
+ junshan_kit/DataSets.py,sha256=PspH23YbB9cSuh5KQp7Dam3fWsfyH0pwL12nt7KN_tQ,8470
  junshan_kit/ExperimentHub.py,sha256=MKduxa7U16zMoavgS-lVOCL2ypcMLpAaD8k7JitNqRU,11493
  junshan_kit/Models.py,sha256=GRTunJON1vLQz2IxgsoOKvjP-3zSJJLuB3CkJTAiImo,6884
  junshan_kit/Print_Info.py,sha256=vogYcXvoGcRGZV-7svi_mtiCZH6c8d-RhbZLFrLbKr8,3012
@@ -11,6 +11,6 @@ junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  junshan_kit/check_args.py,sha256=7m4xSPAwqqQ0SPeKc-MCewDIDB7kFgsNYS2AuTTzGtk,3599
  junshan_kit/datahub.py,sha256=4c3P2TORMZ4va6NrSiojDCpnY_CGDlJV-5PG3u1_Isk,9081
  junshan_kit/kit.py,sha256=hpA4Zpn1VAuhdJSBBXswVum0CSk6QnB05GGLYoaRatQ,9792
- junshan_kit-2.4.2.dist-info/METADATA,sha256=KSijvx8KSsP-pIIPiyYR8PGINFJTIp6DVSt2g-47EX0,266
- junshan_kit-2.4.2.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
- junshan_kit-2.4.2.dist-info/RECORD,,
+ junshan_kit-2.4.4.dist-info/METADATA,sha256=etJ08CZjxNda9hliEBmjxw-1-OOiMtqxAyQXQcKEapU,266
+ junshan_kit-2.4.4.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+ junshan_kit-2.4.4.dist-info/RECORD,,