junshan-kit 2.4.7__py2.py3-none-any.whl → 2.4.9__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of junshan-kit might be problematic. Click here for more details.
- junshan_kit/DataHub.py +114 -0
- junshan_kit/DataProcessor.py +114 -24
- junshan_kit/DataSets.py +186 -37
- junshan_kit/{Models.py → ModelsHub.py} +5 -0
- junshan_kit/ParametersHub.py +404 -0
- junshan_kit/Print_Info.py +6 -2
- junshan_kit/TrainingHub.py +75 -0
- junshan_kit/kit.py +94 -30
- {junshan_kit-2.4.7.dist-info → junshan_kit-2.4.9.dist-info}/METADATA +2 -2
- junshan_kit-2.4.9.dist-info/RECORD +12 -0
- junshan_kit/ComOptimizers.py +0 -126
- junshan_kit/ExperimentHub.py +0 -338
- junshan_kit/SPBM.py +0 -350
- junshan_kit/SPBM_func.py +0 -601
- junshan_kit/TrainingParas.py +0 -470
- junshan_kit/check_args.py +0 -116
- junshan_kit/datahub.py +0 -281
- junshan_kit-2.4.7.dist-info/RECORD +0 -16
- {junshan_kit-2.4.7.dist-info → junshan_kit-2.4.9.dist-info}/WHEEL +0 -0
junshan_kit/DataSets.py
CHANGED
|
@@ -5,14 +5,53 @@
|
|
|
5
5
|
----------------------------------------------------------------------
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
import os
|
|
8
|
+
import os
|
|
9
9
|
import pandas as pd
|
|
10
|
+
from scipy.sparse import csr_matrix
|
|
11
|
+
from scipy.io import savemat
|
|
10
12
|
import junshan_kit.DataProcessor
|
|
11
13
|
import junshan_kit.kit
|
|
12
14
|
from sklearn.preprocessing import StandardScaler
|
|
13
15
|
|
|
14
16
|
#----------------------------------------------------------
|
|
15
17
|
def _download_data(data_name, data_type):
|
|
18
|
+
"""
|
|
19
|
+
Download and extract a dataset from Jianguoyun using either Firefox or Chrome automation.
|
|
20
|
+
|
|
21
|
+
This helper function allows the user to manually provide a Jianguoyun download link,
|
|
22
|
+
choose a browser (Firefox or Chrome) for automated downloading, and automatically unzip the downloaded dataset into a structured local directory.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
data_name (str):
|
|
26
|
+
The name of the dataset (used as a folder name for storage).
|
|
27
|
+
|
|
28
|
+
data_type (str):
|
|
29
|
+
The dataset category, e.g., "binary" or "multi".
|
|
30
|
+
Determines the subdirectory under './exp_data/'.
|
|
31
|
+
|
|
32
|
+
Raises:
|
|
33
|
+
ValueError:
|
|
34
|
+
If `data_type` is not one of the allowed options: ["binary", "multi"].
|
|
35
|
+
|
|
36
|
+
Behavior:
|
|
37
|
+
- Prompts the user to input a Jianguoyun download URL.
|
|
38
|
+
- Lets the user select a download method (Firefox or Chrome).
|
|
39
|
+
- Downloads the `.zip` file into `./exp_data/{data_name}/`.
|
|
40
|
+
- Automatically extracts the zip file in the same directory.
|
|
41
|
+
- Prints progress and completion messages.
|
|
42
|
+
|
|
43
|
+
Example:
|
|
44
|
+
>>> _download_data("mnist", "binary")
|
|
45
|
+
Enter the Jianguoyun download URL: https://www.jianguoyun.com/p/abcd1234
|
|
46
|
+
Select download method:
|
|
47
|
+
1. Firefox
|
|
48
|
+
2. Chrome
|
|
49
|
+
Enter the number of your choice (1 or 2):
|
|
50
|
+
|
|
51
|
+
Note:
|
|
52
|
+
Requires `junshan_kit` with `JianguoyunDownloaderFirefox`,
|
|
53
|
+
`JianguoyunDownloaderChrome`, and `unzip_file` utilities available.
|
|
54
|
+
"""
|
|
16
55
|
allowed_types = ["binary", "multi"]
|
|
17
56
|
if data_type not in allowed_types:
|
|
18
57
|
raise ValueError(f"Invalid data_type: {data_type!r}. Must be one of {allowed_types}.")
|
|
@@ -29,43 +68,78 @@ def _download_data(data_name, data_type):
|
|
|
29
68
|
choice = input("Enter the number of your choice (1 or 2): ").strip()
|
|
30
69
|
|
|
31
70
|
if choice == "1":
|
|
32
|
-
JianguoyunDownloaderFirefox(url, f"./exp_data/{
|
|
33
|
-
print("
|
|
71
|
+
JianguoyunDownloaderFirefox(url, f"./exp_data/{data_name}").run()
|
|
72
|
+
print("*** Download completed using Firefox ***")
|
|
34
73
|
break
|
|
35
74
|
elif choice == "2":
|
|
36
|
-
JianguoyunDownloaderChrome(url, f"./exp_data/{
|
|
37
|
-
print("
|
|
75
|
+
JianguoyunDownloaderChrome(url, f"./exp_data/{data_name}").run()
|
|
76
|
+
print("*** Download completed using Chrome ***")
|
|
38
77
|
break
|
|
39
78
|
else:
|
|
40
|
-
print("
|
|
79
|
+
print("*** Invalid choice. Please enter 1 or 2 ***\n")
|
|
41
80
|
|
|
42
81
|
# unzip file
|
|
43
|
-
junshan_kit.kit.unzip_file(f'./exp_data/{
|
|
82
|
+
junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
|
|
44
83
|
|
|
45
84
|
def _export_csv(df, data_name, data_type):
|
|
46
|
-
path = f'./exp_data/{
|
|
85
|
+
path = f'./exp_data/{data_name}/'
|
|
47
86
|
os.makedirs(path, exist_ok=True)
|
|
48
87
|
df.to_csv(path + f'{data_name}_num.csv', index=False)
|
|
49
88
|
print(path + f'{data_name}.csv')
|
|
50
89
|
|
|
51
90
|
|
|
52
|
-
def
|
|
53
|
-
|
|
91
|
+
def _export_mat(df, data_name, label_col):
|
|
92
|
+
# Extract label and feature matrices
|
|
93
|
+
y = df[label_col].values # Target column
|
|
94
|
+
X = df.drop(columns=[label_col]).values # Feature matrix
|
|
95
|
+
|
|
96
|
+
# Convert to sparse matrices
|
|
97
|
+
X_sparse = csr_matrix(X)
|
|
98
|
+
Y_sparse = csr_matrix(y.reshape(-1, 1)) # Convert target to column sparse matrix
|
|
99
|
+
|
|
100
|
+
# Get number of samples and features
|
|
101
|
+
m, n = X.shape
|
|
102
|
+
|
|
103
|
+
# Save as a MAT file (supports large datasets)
|
|
104
|
+
save_path = f'exp_data/{data_name}/{data_name}.mat'
|
|
105
|
+
savemat(save_path, {'X': X_sparse, 'Y': Y_sparse, 'm': m, 'n': n}, do_compression=True)
|
|
106
|
+
|
|
107
|
+
# Print confirmation
|
|
108
|
+
print("Sparse MAT file saved to:", save_path)
|
|
109
|
+
print("Number of samples (m):", m)
|
|
110
|
+
print("Number of features (n):", n)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None, df = None, missing_strategy = 'drop', Paras = None):
|
|
114
|
+
|
|
115
|
+
if csv_path is not None and not os.path.exists(csv_path):
|
|
54
116
|
print('\n' + '*'*60)
|
|
55
117
|
print(f"Please download the data.")
|
|
56
118
|
print(csv_path)
|
|
57
|
-
_download_data(data_name, data_type=data_type)
|
|
58
|
-
|
|
59
|
-
|
|
119
|
+
_download_data(data_name, data_type=data_type)
|
|
120
|
+
|
|
121
|
+
if not os.path.exists(f"./exp_data/{data_name}"):
|
|
122
|
+
print('\n' + '*'*60)
|
|
123
|
+
print(f"Please download the data.")
|
|
124
|
+
print(f"./exp_data/{data_name}")
|
|
125
|
+
_download_data(data_name, data_type=data_type)
|
|
126
|
+
|
|
127
|
+
if df is None:
|
|
128
|
+
df = pd.read_csv(csv_path)
|
|
129
|
+
|
|
60
130
|
cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
|
|
61
|
-
df = cleaner.preprocess_dataset(
|
|
131
|
+
df = cleaner.preprocess_dataset(df, drop_cols, label_col, label_map, title_name=data_name, user_one_hot_cols=user_one_hot_cols, print_info=print_info, time_info = time_info, missing_strategy = missing_strategy)
|
|
62
132
|
|
|
63
133
|
if export_csv:
|
|
64
134
|
_export_csv(df, data_name, data_type)
|
|
135
|
+
|
|
136
|
+
if Paras is not None and Paras["export_mat"]:
|
|
137
|
+
_export_mat(df, data_name, label_col)
|
|
65
138
|
|
|
66
139
|
return df
|
|
67
140
|
|
|
68
141
|
|
|
142
|
+
|
|
69
143
|
# ********************************************************************
|
|
70
144
|
"""
|
|
71
145
|
----------------------------------------------------------------------
|
|
@@ -73,15 +147,15 @@ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_
|
|
|
73
147
|
----------------------------------------------------------------------
|
|
74
148
|
"""
|
|
75
149
|
|
|
76
|
-
def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False, drop_cols = []
|
|
150
|
+
def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False, drop_cols = []):
    """Preprocess ``./exp_data/{data_name}/creditcard.csv`` through ``_run``.

    The label column is ``'Class'``; its values 0 and 1 are mapped to -1 and 1.

    Args:
        data_name (str): Dataset name, used as the folder under ``./exp_data/``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.
        drop_cols (list): Columns to drop, forwarded to ``_run``.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    return _run(
        f'./exp_data/{data_name}/creditcard.csv',
        data_name,
        "binary",
        drop_cols,
        'Class',
        {0: -1, 1: 1},
        print_info,
        export_csv=export_csv,
    )
|
|
@@ -89,11 +163,11 @@ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print
|
|
|
89
163
|
|
|
90
164
|
def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False, drop_cols = [], Standard = False):
    """Preprocess ``./exp_data/{data_name}/diabetes_dataset.csv`` through ``_run``.

    The label column is ``'diagnosed_diabetes'``; its values 0 and 1 are
    mapped to -1 and 1.

    Args:
        data_name (str): Dataset name, used as the folder under ``./exp_data/``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.
        drop_cols (list): Columns to drop, forwarded to ``_run``.
        Standard (bool): Not referenced in this function.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    source_csv = f'./exp_data/{data_name}/diabetes_dataset.csv'
    binary_labels = {0: -1, 1: 1}

    return _run(source_csv, data_name, "binary", drop_cols, 'diagnosed_diabetes', binary_labels, print_info, export_csv=export_csv)
|
|
99
173
|
|
|
@@ -101,7 +175,7 @@ def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_i
|
|
|
101
175
|
def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False, drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location'], Standard = False):
|
|
102
176
|
|
|
103
177
|
data_type = "binary"
|
|
104
|
-
csv_path = f'./exp_data/{
|
|
178
|
+
csv_path = f'./exp_data/{data_name}/Electric_Vehicle_Population_Data.csv'
|
|
105
179
|
# drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
|
|
106
180
|
label_col = 'Electric Vehicle Type'
|
|
107
181
|
label_map = {
|
|
@@ -110,19 +184,19 @@ def electric_vehicle_population(data_name = "Electric Vehicle Population", print
|
|
|
110
184
|
}
|
|
111
185
|
|
|
112
186
|
|
|
113
|
-
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv
|
|
187
|
+
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
|
|
114
188
|
|
|
115
189
|
return df
|
|
116
190
|
|
|
117
191
|
def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False, drop_cols = ['property_id'], Standard =False):
    """Preprocess ``./exp_data/{data_name}/global_house_purchase_dataset.csv`` through ``_run``.

    The label column is ``'decision'``; its values 0 and 1 are mapped to -1 and 1.

    Args:
        data_name (str): Dataset name, used as the folder under ``./exp_data/``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.
        drop_cols (list): Columns to drop, forwarded to ``_run``.
        Standard (bool): Not referenced in this function.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    return _run(
        f'./exp_data/{data_name}/global_house_purchase_dataset.csv',
        data_name,
        "binary",
        drop_cols,
        'decision',
        {0: -1, 1: 1},
        print_info,
        export_csv=export_csv,
    )
|
|
128
202
|
|
|
@@ -130,13 +204,13 @@ def global_house_purchase(data_name = "Global House Purchase", print_info = Fals
|
|
|
130
204
|
def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False, drop_cols = ['id'], Standard =False):
    """Preprocess ``./exp_data/{data_name}/health_lifestyle_dataset.csv`` through ``_run``.

    The label column is ``'disease_risk'``; its values 0 and 1 are mapped to
    -1 and 1.

    Args:
        data_name (str): Dataset name, used as the folder under ``./exp_data/``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.
        drop_cols (list): Columns to drop, forwarded to ``_run``.
        Standard (bool): Not referenced in this function.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    source_csv = f'./exp_data/{data_name}/health_lifestyle_dataset.csv'
    binary_labels = {0: -1, 1: 1}

    return _run(source_csv, data_name, "binary", drop_cols, 'disease_risk', binary_labels, print_info, export_csv=export_csv)
|
|
142
216
|
|
|
@@ -149,13 +223,13 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic
|
|
|
149
223
|
"""
|
|
150
224
|
|
|
151
225
|
data_type = "binary"
|
|
152
|
-
csv_path = f'./exp_data/{
|
|
226
|
+
csv_path = f'./exp_data/{data_name}/medical_insurance.csv'
|
|
153
227
|
|
|
154
228
|
label_col = 'is_high_risk'
|
|
155
229
|
label_map = {0: -1, 1: 1}
|
|
156
230
|
|
|
157
231
|
|
|
158
|
-
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv
|
|
232
|
+
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
|
|
159
233
|
|
|
160
234
|
return df
|
|
161
235
|
|
|
@@ -163,13 +237,13 @@ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Predic
|
|
|
163
237
|
def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False, drop_cols = [], Standard =False):
    """Preprocess ``./exp_data/{data_name}/Particle Physics Event Classification.csv`` through ``_run``.

    The label column is ``'Label'``; ``'s'`` is mapped to -1 and ``'b'`` to 1.

    Args:
        data_name (str): Dataset name, used as the folder under ``./exp_data/``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.
        drop_cols (list): Columns to drop, forwarded to ``_run``.
        Standard (bool): Not referenced in this function.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    return _run(
        f'./exp_data/{data_name}/Particle Physics Event Classification.csv',
        data_name,
        "binary",
        drop_cols,
        'Label',
        {'s': -1, 'b': 1},
        print_info,
        export_csv=export_csv,
    )
|
|
175
249
|
|
|
@@ -178,13 +252,13 @@ def particle_physics_event_classification(data_name = "Particle Physics Event Cl
|
|
|
178
252
|
def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False, drop_cols = [], Standard = False):
    """Preprocess ``./exp_data/{data_name}/adult.csv`` through ``_run``.

    The label column is ``'income'``; ``'<=50K'`` is mapped to -1 and
    ``'>50K'`` to 1.

    Args:
        data_name (str): Dataset name, used as the folder under ``./exp_data/``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.
        drop_cols (list): Columns to drop, forwarded to ``_run``.
        Standard (bool): Not referenced in this function.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    source_csv = f'./exp_data/{data_name}/adult.csv'
    income_labels = {'<=50K': -1, '>50K': 1}

    return _run(source_csv, data_name, "binary", drop_cols, 'income', income_labels, print_info, export_csv=export_csv)
|
|
190
264
|
|
|
@@ -192,7 +266,7 @@ def adult_income_prediction(data_name = "Adult Income Prediction", print_info =
|
|
|
192
266
|
def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False, drop_cols = ['Unnamed: 0'], Standard = False):
|
|
193
267
|
|
|
194
268
|
data_type = "binary"
|
|
195
|
-
csv_path = f'./exp_data/{
|
|
269
|
+
csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
|
|
196
270
|
|
|
197
271
|
label_col = 'rain_tomorrow'
|
|
198
272
|
label_map = {0: -1, 1: 1}
|
|
@@ -202,15 +276,15 @@ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info =
|
|
|
202
276
|
'trans_type': 0
|
|
203
277
|
}
|
|
204
278
|
|
|
205
|
-
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info
|
|
279
|
+
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
|
|
206
280
|
|
|
207
281
|
|
|
208
282
|
return df
|
|
209
283
|
|
|
210
|
-
def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False, drop_cols = ['user_id']
|
|
284
|
+
def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False, drop_cols = ['user_id']):
|
|
211
285
|
|
|
212
286
|
data_type = "binary"
|
|
213
|
-
csv_path = f'./exp_data/{
|
|
287
|
+
csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
|
|
214
288
|
|
|
215
289
|
label_col = 'subscribed_after'
|
|
216
290
|
label_map = {0: -1, 1: 1}
|
|
@@ -218,11 +292,86 @@ def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = Fa
|
|
|
218
292
|
# Extraction mode.
|
|
219
293
|
# - 0 : Extract ['year', 'month', 'day', 'hour']
|
|
220
294
|
# - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
|
|
295
|
+
# - 2 : Extract ['year', 'month', 'day']
|
|
221
296
|
time_info = {
|
|
222
297
|
'time_col_name': 'timestamp',
|
|
223
298
|
'trans_type': 1
|
|
224
299
|
}
|
|
225
300
|
|
|
226
|
-
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info
|
|
301
|
+
df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
|
|
302
|
+
|
|
303
|
+
return df
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def Santander_Customer_Satisfaction(data_name = "SantanderCustomerSatisfaction", print_info = False, export_csv = False):
    """Fetch the dataset with ``junshan_kit.kit.download_openml_data`` and preprocess it.

    No CSV path is used: the downloaded frame is handed to ``_run`` directly.
    ``'ID_code'`` is dropped, the label column is ``'target'`` and its values
    ``False``/``True`` are mapped to -1/1.

    Args:
        data_name (str): Name passed to the downloader and to ``_run``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    # The downloader returns four values; only the first one is used here.
    raw_df, _labels, _categorical_mask, _feature_names = junshan_kit.kit.download_openml_data(data_name)

    return _run(
        None,
        data_name,
        "binary",
        ['ID_code'],
        'target',
        {False: -1, True: 1},
        print_info,
        export_csv=export_csv,
        df=raw_df,
    )
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def newsgroups_drift(data_name = "20_newsgroups.drift", print_info = False, export_csv = False):
    """Fetch the dataset with ``junshan_kit.kit.download_openml_data`` and preprocess it.

    No CSV path is used: the downloaded frame is handed to ``_run`` directly.
    ``'ID_code'`` is dropped, the label column is ``'target'`` and its values
    ``False``/``True`` are mapped to -1/1.

    Args:
        data_name (str): Name passed to the downloader and to ``_run``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    # NOTE(review): the drop column, label column and label map are identical
    # to those in Santander_Customer_Satisfaction -- confirm they match this
    # dataset's actual schema.
    columns_to_drop = ['ID_code']
    target_labels = {False: -1, True: 1}

    # The downloader returns four values; only the first one is used here.
    raw_df, _labels, _categorical_mask, _feature_names = junshan_kit.kit.download_openml_data(data_name)

    return _run(None, data_name, "binary", columns_to_drop, 'target', target_labels, print_info, export_csv=export_csv, df=raw_df)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def Homesite_Quote_Conversion(data_name = "Homesite_Quote_Conversion", print_info = False, export_csv = False):
    """Fetch the dataset with ``junshan_kit.kit.download_openml_data`` and preprocess it.

    No CSV path is used: the downloaded frame is handed to ``_run`` directly.
    ``'QuoteNumber'`` is dropped, the label column is ``'QuoteConversion_Flag'``
    (0/1 mapped to -1/1), the missing-value strategy is ``'mode'`` and the
    ``'Original_Quote_Date'`` column is passed as time information with
    ``trans_type`` 2.

    Args:
        data_name (str): Name passed to the downloader and to ``_run``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    quote_date_info = {'time_col_name': 'Original_Quote_Date', 'trans_type': 2}

    # The downloader returns four values; only the first one is used here.
    raw_df, _labels, _categorical_mask, _feature_names = junshan_kit.kit.download_openml_data(data_name)

    return _run(
        None,
        data_name,
        "binary",
        ['QuoteNumber'],
        'QuoteConversion_Flag',
        {0: -1, 1: 1},
        print_info,
        export_csv=export_csv,
        df=raw_df,
        time_info=quote_date_info,
        missing_strategy='mode',
    )
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def IEEE_CIS_Fraud_Detection(data_name = "IEEE-CIS_Fraud_Detection", print_info = False, export_csv = False, export_mat = False):
    """Fetch the dataset with ``junshan_kit.kit.download_openml_data`` and preprocess it.

    No CSV path is used: the downloaded frame is handed to ``_run`` directly.
    ``'TransactionID'`` is dropped, the label column is ``'isFraud'`` (0/1
    mapped to -1/1) and the missing-value strategy is ``'mode'``.

    Args:
        data_name (str): Name passed to the downloader and to ``_run``.
        print_info (bool): Forwarded to ``_run``.
        export_csv (bool): Forwarded to ``_run``.
        export_mat (bool): Passed to ``_run`` as ``Paras["export_mat"]``.

    Returns:
        The processed DataFrame returned by ``_run``.
    """
    extra_options = {"export_mat": export_mat}

    # The downloader returns four values; only the first one is used here.
    raw_df, _labels, _categorical_mask, _feature_names = junshan_kit.kit.download_openml_data(data_name)

    return _run(
        None,
        data_name,
        "binary",
        ['TransactionID'],
        'isFraud',
        {0: -1, 1: 1},
        print_info,
        export_csv=export_csv,
        df=raw_df,
        missing_strategy='mode',
        Paras=extra_options,
    )
|
|
375
|
+
|
|
376
|
+
|
|
227
377
|
|
|
228
|
-
return df
|
|
@@ -202,6 +202,11 @@ def Build_LogRegressionBinaryL2_w8a():
|
|
|
202
202
|
nn.Flatten(),
|
|
203
203
|
nn.Linear(300, 1))
|
|
204
204
|
|
|
205
|
+
# ---------------------------------------------------------
|
|
206
|
+
def Build_LogRegressionBinaryL2_Adult_Income_Prediction():
    """Placeholder with no implementation yet; always returns ``None``."""
    return None
|
|
205
208
|
|
|
206
209
|
|
|
210
|
+
def Build_LogRegressionBinaryL2_Credit_Card_Fraud_Detection():
    """Placeholder with no implementation yet; always returns ``None``."""
    return None
|
|
207
212
|
|