junshan-kit 2.4.3__py2.py3-none-any.whl → 2.4.5__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of junshan-kit might be problematic. Click here for more details.
junshan_kit/DataProcessor.py
CHANGED
|
@@ -146,6 +146,7 @@ class CSV_TO_Pandas:
|
|
|
146
146
|
"""
|
|
147
147
|
# Step 0: Load the dataset
|
|
148
148
|
df = pd.read_csv(csv_path)
|
|
149
|
+
columns = df.columns
|
|
149
150
|
|
|
150
151
|
# Save original size
|
|
151
152
|
m_original, n_original = df.shape
|
|
@@ -214,13 +215,17 @@ class CSV_TO_Pandas:
|
|
|
214
215
|
print(f"{'Label column:':<40} {label_col}")
|
|
215
216
|
print(f"{'label_map:':<40} {label_map}")
|
|
216
217
|
print(f"{'time column:':<40} {time_info}")
|
|
217
|
-
if time_info:
|
|
218
|
+
if time_info is not None:
|
|
218
219
|
print(f"{'trans_type : int, optional, default=1'}")
|
|
219
|
-
print(f"{
|
|
220
|
-
print(f"{
|
|
220
|
+
print(f"{- 0 : Extract ['year', 'month', 'day', 'hour']:<50}")
|
|
221
|
+
print(f"{ - 1 : Extract ['hour', 'dayofweek', 'is_weekend']:<50}")
|
|
221
222
|
print(
|
|
222
223
|
f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
|
|
223
224
|
)
|
|
225
|
+
print("-" * 80)
|
|
226
|
+
print(
|
|
227
|
+
f"{'all columns:':<40} {', '.join(columns)}"
|
|
228
|
+
)
|
|
224
229
|
print("=" * 80 + "\n")
|
|
225
230
|
|
|
226
231
|
return df
|
junshan_kit/DataSets.py
CHANGED
|
@@ -42,8 +42,8 @@ def _download_data(data_name, data_type):
|
|
|
42
42
|
# unzip file
|
|
43
43
|
junshan_kit.kit.unzip_file(f'./exp_data/{data_type}/{data_name}/{data_name}.zip', f'./exp_data/{data_type}/{data_name}')
|
|
44
44
|
|
|
45
|
-
def _export_csv(df, data_name):
|
|
46
|
-
path = f'./exp_data/{data_name}/'
|
|
45
|
+
def _export_csv(df, data_name, data_type):
|
|
46
|
+
path = f'./exp_data/{data_type}/{data_name}/'
|
|
47
47
|
os.makedirs(path, exist_ok=True)
|
|
48
48
|
df.to_csv(path + f'{data_name}_num.csv')
|
|
49
49
|
print(path + f'{data_name}.csv')
|
|
@@ -61,7 +61,7 @@ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_
|
|
|
61
61
|
df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, user_one_hot_cols, print_info=print_info, time_info = time_info)
|
|
62
62
|
|
|
63
63
|
if export_csv:
|
|
64
|
-
_export_csv(df, data_name)
|
|
64
|
+
_export_csv(df, data_name, data_type)
|
|
65
65
|
|
|
66
66
|
return df
|
|
67
67
|
|
|
@@ -73,11 +73,10 @@ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_
|
|
|
73
73
|
----------------------------------------------------------------------
|
|
74
74
|
"""
|
|
75
75
|
|
|
76
|
-
def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False):
|
|
76
|
+
def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False, drop_cols = []):
|
|
77
77
|
|
|
78
78
|
data_type = "binary"
|
|
79
79
|
csv_path = f'./exp_data/{data_type}/{data_name}/creditcard.csv'
|
|
80
|
-
drop_cols = []
|
|
81
80
|
label_col = 'Class'
|
|
82
81
|
label_map = {0: -1, 1: 1}
|
|
83
82
|
|
|
@@ -88,24 +87,22 @@ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print
|
|
|
88
87
|
return df
|
|
89
88
|
|
|
90
89
|
|
|
91
|
-
def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False):
|
|
90
|
+
def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False, drop_cols = []):
|
|
92
91
|
data_type = "binary"
|
|
93
92
|
csv_path = f'./exp_data/{data_type}/{data_name}/diabetes_dataset.csv'
|
|
94
|
-
drop_cols = []
|
|
95
93
|
label_col = 'diagnosed_diabetes'
|
|
96
94
|
label_map = {0: -1, 1: 1}
|
|
97
95
|
|
|
98
|
-
|
|
99
96
|
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
|
|
100
97
|
|
|
101
98
|
return df
|
|
102
99
|
|
|
103
100
|
|
|
104
|
-
def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False):
|
|
101
|
+
def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False, drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']):
|
|
105
102
|
|
|
106
103
|
data_type = "binary"
|
|
107
104
|
csv_path = f'./exp_data/{data_type}/{data_name}/Electric_Vehicle_Population_Data.csv'
|
|
108
|
-
drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
|
|
105
|
+
# drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
|
|
109
106
|
label_col = 'Electric Vehicle Type'
|
|
110
107
|
label_map = {
|
|
111
108
|
'Battery Electric Vehicle (BEV)': 1,
|
|
@@ -117,11 +114,10 @@ def electric_vehicle_population(data_name = "Electric Vehicle Population", print
|
|
|
117
114
|
|
|
118
115
|
return df
|
|
119
116
|
|
|
120
|
-
def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False):
|
|
117
|
+
def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False, drop_cols = ['property_id']):
|
|
121
118
|
|
|
122
119
|
data_type = "binary"
|
|
123
120
|
csv_path = f'./exp_data/{data_type}/{data_name}/global_house_purchase_dataset.csv'
|
|
124
|
-
drop_cols = ['property_id']
|
|
125
121
|
label_col = 'decision'
|
|
126
122
|
label_map = {0: -1, 1: 1}
|
|
127
123
|
|
|
@@ -131,11 +127,11 @@ def global_house_purchase(data_name = "Global House Purchase", print_info = Fals
|
|
|
131
127
|
return df
|
|
132
128
|
|
|
133
129
|
|
|
134
|
-
def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False):
|
|
130
|
+
def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False, drop_cols = ['id']):
|
|
135
131
|
|
|
136
132
|
data_type = "binary"
|
|
137
133
|
csv_path = f'./exp_data/{data_type}/{data_name}/health_lifestyle_dataset.csv'
|
|
138
|
-
|
|
134
|
+
|
|
139
135
|
label_col = 'disease_risk'
|
|
140
136
|
label_map = {0: -1, 1: 1}
|
|
141
137
|
|
|
@@ -145,7 +141,7 @@ def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_
|
|
|
145
141
|
return df
|
|
146
142
|
|
|
147
143
|
|
|
148
|
-
def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False, export_csv = False):
|
|
144
|
+
def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False, export_csv = False, drop_cols = ['alcohol_freq']):
|
|
149
145
|
"""
|
|
150
146
|
1. The missing values in this dataset are handled by directly removing the corresponding column. Since the `alcohol_freq` column contains a large number of missing values, deleting the rows would result in significant data loss, so the entire column is dropped instead.
|
|
151
147
|
|
|
@@ -154,7 +150,7 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic
|
|
|
154
150
|
|
|
155
151
|
data_type = "binary"
|
|
156
152
|
csv_path = f'./exp_data/{data_type}/{data_name}/medical_insurance.csv'
|
|
157
|
-
|
|
153
|
+
|
|
158
154
|
label_col = 'is_high_risk'
|
|
159
155
|
label_map = {0: -1, 1: 1}
|
|
160
156
|
|
|
@@ -164,11 +160,11 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic
|
|
|
164
160
|
return df
|
|
165
161
|
|
|
166
162
|
|
|
167
|
-
def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False):
|
|
163
|
+
def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False, drop_cols = []):
|
|
168
164
|
|
|
169
165
|
data_type = "binary"
|
|
170
166
|
csv_path = f'./exp_data/{data_type}/{data_name}/Particle Physics Event Classification.csv'
|
|
171
|
-
|
|
167
|
+
|
|
172
168
|
label_col = 'Label'
|
|
173
169
|
label_map = {'s': -1, 'b': 1}
|
|
174
170
|
|
|
@@ -179,11 +175,11 @@ def particle_physics_event_classification(data_name = "Particle Physics Event Cl
|
|
|
179
175
|
|
|
180
176
|
|
|
181
177
|
|
|
182
|
-
def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False):
|
|
178
|
+
def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False, drop_cols = []):
|
|
183
179
|
|
|
184
180
|
data_type = "binary"
|
|
185
181
|
csv_path = f'./exp_data/{data_type}/{data_name}/adult.csv'
|
|
186
|
-
|
|
182
|
+
|
|
187
183
|
label_col = 'income'
|
|
188
184
|
label_map = {'<=50K': -1, '>50K': 1}
|
|
189
185
|
|
|
@@ -193,11 +189,11 @@ def adult_income_prediction(data_name = "Adult Income Prediction", print_info =
|
|
|
193
189
|
return df
|
|
194
190
|
|
|
195
191
|
|
|
196
|
-
def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
|
|
192
|
+
def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False, drop_cols = ['Unnamed: 0']):
|
|
197
193
|
|
|
198
194
|
data_type = "binary"
|
|
199
195
|
csv_path = f'./exp_data/{data_type}/{data_name}/TNweather_1.8M.csv'
|
|
200
|
-
|
|
196
|
+
|
|
201
197
|
label_col = 'rain_tomorrow'
|
|
202
198
|
label_map = {0: -1, 1: 1}
|
|
203
199
|
|
|
@@ -214,11 +210,11 @@ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info =
|
|
|
214
210
|
|
|
215
211
|
return df
|
|
216
212
|
|
|
217
|
-
def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
|
|
213
|
+
def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False, drop_cols = ['user_id']):
|
|
218
214
|
|
|
219
215
|
data_type = "binary"
|
|
220
216
|
csv_path = f'./exp_data/{data_type}/{data_name}/youtube recommendation dataset.csv'
|
|
221
|
-
|
|
217
|
+
|
|
222
218
|
label_col = 'subscribed_after'
|
|
223
219
|
label_map = {0: -1, 1: 1}
|
|
224
220
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
junshan_kit/ComOptimizers.py,sha256=MUgFnm1DbbvNKv5-7nHJCLOfq4VjoNk1KLRR-ji5rOA,4637
|
|
2
|
-
junshan_kit/DataProcessor.py,sha256=
|
|
3
|
-
junshan_kit/DataSets.py,sha256=
|
|
2
|
+
junshan_kit/DataProcessor.py,sha256=oAbf9QsCaLjnN2yrcn8qGof56dSkEv9jHkWiB1CxfTw,9106
|
|
3
|
+
junshan_kit/DataSets.py,sha256=PspH23YbB9cSuh5KQp7Dam3fWsfyH0pwL12nt7KN_tQ,8470
|
|
4
4
|
junshan_kit/ExperimentHub.py,sha256=MKduxa7U16zMoavgS-lVOCL2ypcMLpAaD8k7JitNqRU,11493
|
|
5
5
|
junshan_kit/Models.py,sha256=GRTunJON1vLQz2IxgsoOKvjP-3zSJJLuB3CkJTAiImo,6884
|
|
6
6
|
junshan_kit/Print_Info.py,sha256=vogYcXvoGcRGZV-7svi_mtiCZH6c8d-RhbZLFrLbKr8,3012
|
|
@@ -11,6 +11,6 @@ junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
11
11
|
junshan_kit/check_args.py,sha256=7m4xSPAwqqQ0SPeKc-MCewDIDB7kFgsNYS2AuTTzGtk,3599
|
|
12
12
|
junshan_kit/datahub.py,sha256=4c3P2TORMZ4va6NrSiojDCpnY_CGDlJV-5PG3u1_Isk,9081
|
|
13
13
|
junshan_kit/kit.py,sha256=hpA4Zpn1VAuhdJSBBXswVum0CSk6QnB05GGLYoaRatQ,9792
|
|
14
|
-
junshan_kit-2.4.
|
|
15
|
-
junshan_kit-2.4.
|
|
16
|
-
junshan_kit-2.4.
|
|
14
|
+
junshan_kit-2.4.5.dist-info/METADATA,sha256=w5OjSbU0MXzViQmIv8J2YR1Jx87gMWVyKeqEIuR3AUU,266
|
|
15
|
+
junshan_kit-2.4.5.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
|
16
|
+
junshan_kit-2.4.5.dist-info/RECORD,,
|
|
File without changes
|