junshan-kit 2.4.0__py2.py3-none-any.whl → 2.4.1__py2.py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- junshan_kit/{Optimizers.py → ComOptimizers.py} +42 -1
- junshan_kit/DataProcessor.py +14 -10
- junshan_kit/DataSets.py +70 -220
- junshan_kit/ExperimentHub.py +141 -7
- junshan_kit/Print_Info.py +3 -3
- junshan_kit/SPBM.py +350 -0
- junshan_kit/SPBM_func.py +601 -0
- junshan_kit/check_args.py +1 -1
- {junshan_kit-2.4.0.dist-info → junshan_kit-2.4.1.dist-info}/METADATA +1 -1
- junshan_kit-2.4.1.dist-info/RECORD +16 -0
- junshan_kit-2.4.0.dist-info/RECORD +0 -14
- {junshan_kit-2.4.0.dist-info → junshan_kit-2.4.1.dist-info}/WHEEL +0 -0
junshan_kit/{Optimizers.py → ComOptimizers.py}
RENAMED
@@ -1,6 +1,7 @@
 import torch, os, time
 from torch.optim.optimizer import Optimizer
 from torch.nn.utils import parameters_to_vector, vector_to_parameters
+import junshan_kit.SPBM_func as SPBM_func

 class SPSmax(Optimizer):
     def __init__(self, params, model, hyperparams, Paras):
@@ -82,4 +83,44 @@ class ALR_SMAG(Optimizer):
         vector_to_parameters(xk, self.model.parameters())

         # temporarily return loss (tensor type)
-        return loss
+        return loss
+
+# ------------ Bundle Method --------------------
+class Bundle(Optimizer):
+    def __init__(self, params, model, hyperparams, Paras):
+        defaults = dict()
+        super().__init__(params, defaults)
+        self.model = model
+        self.cutting_num = hyperparams['cutting_number']
+        self.delta = hyperparams['delta']
+        self.Paras = Paras
+
+        self.x_his, self.g_his, self.f_his = [], [], []
+
+    def step(self, closure=None):
+        if closure is None:
+            raise RuntimeError("Closure required for CuttingPlaneOptimizer")
+
+        # Reset the gradient and perform the forward computation
+        loss = closure()
+
+        with torch.no_grad():
+            xk = parameters_to_vector(self.model.parameters())
+            # print(torch.norm(xk))
+            g_k = parameters_to_vector([p.grad if p.grad is not None else torch.zeros_like(p) for p in self.model.parameters()])
+
+            # Add a cutting plane
+            x_his, f_his, g_his = SPBM_func.add_cutting(self.x_his, self.f_his, self.g_his, xk.detach().clone(), g_k.detach().clone(), loss.detach().clone(), self.cutting_num)
+
+            # the coefficients of the dual problem
+            Gk, rk, ek = SPBM_func.get_var(x_his, f_his, g_his, self.delta)
+
+            # solve the dual problem
+            xk = SPBM_func.bundle(Gk, ek, xk, self.delta, self.Paras)
+
+            # print(len(self.f_his))
+            vector_to_parameters(xk, self.model.parameters())
+
+        # return loss (tensor type)
+        return loss
+
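The new `Bundle` optimizer keeps a bounded history of `(x, f, g)` triples, turns them into cutting planes, and re-solves a dual subproblem on every step, so it uses PyTorch's closure-based `step` protocol rather than a plain `step()`. A minimal usage sketch, assuming the rename to `ComOptimizers` shown above; the hyperparameter values and the empty `Paras` dict are placeholders, not values taken from this diff:

```python
import torch
from junshan_kit.ComOptimizers import Bundle

model = torch.nn.Linear(10, 1)
# 'cutting_number' and 'delta' are the keys Bundle.__init__ reads above;
# the values here are illustrative guesses, and Paras={} is a placeholder.
opt = Bundle(model.parameters(), model,
             hyperparams={'cutting_number': 10, 'delta': 1.0}, Paras={})

x, y = torch.randn(32, 10), torch.randn(32, 1)

def closure():
    # step() raises RuntimeError without a closure: it must re-evaluate
    # the loss and gradients before adding a new cutting plane.
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    return loss

loss = opt.step(closure)
```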
junshan_kit/DataProcessor.py
CHANGED
@@ -32,7 +32,6 @@ class CSV_TO_Pandas:
     - time_col_name : str
         Name of the column containing time or datetime values.
     - trans_type : int, optional, default=1
-        Extraction mode.
         - 0 : Extract ['year', 'month', 'day', 'hour']
         - 1 : Extract ['hour', 'dayofweek', 'is_weekend']

@@ -151,9 +150,6 @@ class CSV_TO_Pandas:
        # Save original size
        m_original, n_original = df.shape

-       if time_info is not None:
-           df = self._trans_time_fea(df, time_info)
-
        # Step 1: Drop non-informative columns
        df = df.drop(columns=drop_cols)

@@ -161,6 +157,9 @@ class CSV_TO_Pandas:
        df = df.dropna(axis=0, how="any")
        m_encoded, n_encoded = df.shape

+       if time_info is not None:
+           df = self._trans_time_fea(df, time_info)
+
        # Step 3: Map target label (to -1 and +1)
        df[label_col] = df[label_col].map(label_map)

@@ -195,11 +194,14 @@ class CSV_TO_Pandas:

            # Step 6: Print dataset information
            print("\n" + "=" * 80)
-           print(f"{f'{title_name} - Info':^70}")
+           print(f"{f'{title_name} - Summary':^70}")
            print("=" * 80)
            print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
            print(
-               f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+               f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+           )
+           print(
+               f"{'Dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
            )
            print(f"{'Positive samples (+1):':<40} {pos_count}")
            print(f"{'Negative samples (-1):':<40} {neg_count}")
@@ -207,13 +209,15 @@ class CSV_TO_Pandas:
                f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
            )
            print("-" * 80)
-           print(f"Note:")
+           print(f"{'More details about preprocessing':^70}")
+           print("-" * 80)
            print(f"{'Label column:':<40} {label_col}")
            print(f"{'label_map:':<40} {label_map}")
            print(f"{'time column:':<40} {time_info}")
-           print(
-               f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
-           )
+           if time_info:
+               print(f"{'trans_type : int, optional, default=1'}")
+               print(f"{' - 0 : Extract [\'year\', \'month\', \'day\', \'hour\']':<10}")
+               print(f"{' - 1 : Extract [\'hour\', \'dayofweek\', \'is_weekend\']':<10}")
            print(
                f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
            )
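The functional change in this file is ordering: `_trans_time_fea` now runs after NaN rows are dropped rather than before, and the summary block documents the `trans_type` modes when a time column is present. For reference, the two modes correspond to standard pandas datetime accessors; a standalone sketch of mode 1 (this mirrors the commented-out code removed from DataSets.py below, it is not a call into the package itself):

```python
import pandas as pd

df = pd.DataFrame({'timestamp': pd.date_range('2024-01-01', periods=6, freq='7h')})
ts = pd.to_datetime(df['timestamp'], errors='coerce')

# trans_type = 1: extract ['hour', 'dayofweek', 'is_weekend']
df['hour'] = ts.dt.hour
df['dayofweek'] = ts.dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

# trans_type = 0 would instead extract ['year', 'month', 'day', 'hour']
# via ts.dt.year, ts.dt.month, ts.dt.day, ts.dt.hour.
print(df.head())
```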
junshan_kit/DataSets.py
CHANGED
@@ -12,7 +12,10 @@ import junshan_kit.kit
 from sklearn.preprocessing import StandardScaler

 #----------------------------------------------------------
-def _download_data(data_name):
+def _download_data(data_name, data_type):
+    allowed_types = ["binary", "multi"]
+    if data_type not in allowed_types:
+        raise ValueError(f"Invalid data_type: {data_type!r}. Must be one of {allowed_types}.")
     from junshan_kit.kit import JianguoyunDownloaderFirefox, JianguoyunDownloaderChrome

     # User selects download method
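The new guard makes `_download_data` fail fast on an unknown `data_type` before any browser download starts; for example (calling the private helper directly, purely to show the failure mode):

```python
from junshan_kit.DataSets import _download_data

# Raises immediately:
# ValueError: Invalid data_type: 'Binary'. Must be one of ['binary', 'multi'].
_download_data("Adult Income Prediction", "Binary")
```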
@@ -26,32 +29,32 @@ def _download_data(data_name):
        choice = input("Enter the number of your choice (1 or 2): ").strip()

        if choice == "1":
-           JianguoyunDownloaderFirefox(url, f"./exp_data/{data_name}").run()
+           JianguoyunDownloaderFirefox(url, f"./exp_data/{data_type}/{data_name}").run()
            print("✅ Download completed using Firefox")
            break
        elif choice == "2":
-           JianguoyunDownloaderChrome(url, f"./exp_data/{data_name}").run()
+           JianguoyunDownloaderChrome(url, f"./exp_data/{data_type}/{data_name}").run()
            print("✅ Download completed using Chrome")
            break
        else:
            print("❌ Invalid choice. Please enter 1 or 2.\n")

    # unzip file
-   junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+   junshan_kit.kit.unzip_file(f'./exp_data/{data_type}/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')

def _export_csv(df, data_name):
-   path = f'./
+   path = f'./exp_data/{data_name}/'
    os.makedirs(path, exist_ok=True)
-   df.to_csv(path + f'{data_name}.csv')
+   df.to_csv(path + f'{data_name}_num.csv')
    print(path + f'{data_name}.csv')


-def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None):
+def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None):
    if not os.path.exists(csv_path):
        print('\n' + '*'*60)
        print(f"Please download the data.")
        print(csv_path)
-       _download_data(data_name)
+       _download_data(data_name, data_type=data_type)
        # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')

    cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
@@ -62,112 +65,138 @@ def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_

    return df

+
+# ********************************************************************
"""
----------------------------------------------------------------------
Datasets
----------------------------------------------------------------------
"""

-def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False):
+def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False):

-   csv_path = f'./exp_data/{data_name}/creditcard.csv'
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/creditcard.csv'
    drop_cols = []
    label_col = 'Class'
    label_map = {0: -1, 1: 1}
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
+

    return df


-def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False):
-   csv_path = f'./exp_data/{data_name}/diabetes_dataset.csv'
+def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False):
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/diabetes_dataset.csv'
    drop_cols = []
    label_col = 'diagnosed_diabetes'
    label_map = {0: -1, 1: 1}
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

    return df


-def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False):
-   csv_path = f'./exp_data/{data_name}/Electric_Vehicle_Population_Data.csv'
+def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False):
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/Electric_Vehicle_Population_Data.csv'
    drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
    label_col = 'Electric Vehicle Type'
    label_map = {
        'Battery Electric Vehicle (BEV)': 1,
        'Plug-in Hybrid Electric Vehicle (PHEV)': -1
    }
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

    return df

-def global_house_purchase(data_name = "Global House Purchase", print_info = False):
-   csv_path = f'./exp_data/{data_name}/global_house_purchase_dataset.csv'
+def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False):
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/global_house_purchase_dataset.csv'
    drop_cols = ['property_id']
    label_col = 'decision'
    label_map = {0: -1, 1: 1}
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

    return df


-def health_lifestyle(data_name = "Health Lifestyle", print_info = False):
-   csv_path = f'./exp_data/{data_name}/health_lifestyle_dataset.csv'
+def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False):
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/health_lifestyle_dataset.csv'
    drop_cols = ['id']
    label_col = 'disease_risk'
    label_map = {0: -1, 1: 1}
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

    return df


-
-def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False):
+def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False, export_csv = False):
    """
    1. The missing values in this dataset are handled by directly removing the corresponding column. Since the `alcohol_freq` column contains a large number of missing values, deleting the rows would result in significant data loss, so the entire column is dropped instead.

    2. There are several columns that could serve as binary classification labels, such as `is_high_risk`, `cardiovascular_disease`, and `liver_disease`. In this case, `is_high_risk` is chosen as the label column.
    """
-   csv_path = f'./exp_data/{data_name}/medical_insurance.csv'
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/medical_insurance.csv'
    drop_cols = ['alcohol_freq']
    label_col = 'is_high_risk'
    label_map = {0: -1, 1: 1}
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

    return df


-def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False):
-   csv_path = f'./exp_data/{data_name}/Particle Physics Event Classification.csv'
+def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False):
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/Particle Physics Event Classification.csv'
    drop_cols = []
    label_col = 'Label'
    label_map = {'s': -1, 'b': 1}
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

    return df



-def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False):
-   csv_path = f'./exp_data/{data_name}/adult.csv'
+def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False):
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/adult.csv'
    drop_cols = []
    label_col = 'income'
    label_map = {'<=50K': -1, '>50K': 1}
+

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)

    return df


def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
-   csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/TNweather_1.8M.csv'
    drop_cols = ['Unnamed: 0']
    label_col = 'rain_tomorrow'
    label_map = {0: -1, 1: 1}
@@ -180,194 +209,15 @@ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info =
        'trans_type': 0
    }

-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
-
-   return df
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)


-
-   # csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
-   # label_col = 'rain_tomorrow'
-   # label_map = {0: -1, 1: 1}
-
-   # if not os.path.exists(csv_path):
-   #     print('\n' + '*'*60)
-   #     print(f"Please download the data.")
-   #     print(csv_path)
-   #     _download_data(data_name)
-   #     # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
-   # # Step 0: Load the dataset
-   # df = pd.read_csv(csv_path)
-
-   # df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
-   # df = df.dropna(subset=['timestamp'])
-
-   # df['time'] = pd.to_datetime(df['time'])
-   # df['year'] = df['time'].dt.year
-   # df['month'] = df['time'].dt.month
-   # df['day'] = df['time'].dt.day
-   # df['hour'] = df['time'].dt.hour
-
-   # user_one_hot_cols = ['year','month','day', 'hour']
-   # drop_cols = ['Unnamed: 0', 'time']
-
-   # # Save original size
-   # m_original, n_original = df.shape
-
-   # # Step 1: Drop non-informative columns
-   # df = df.drop(columns=drop_cols)
-
-   # # Step 2: Remove rows with missing values
-   # df = df.dropna(axis=0, how="any")
-   # m_encoded, n_encoded = df.shape
-
-   # # Step 3: Map target label (to -1 and +1)
-   # df[label_col] = df[label_col].map(label_map)
-
-   # # Step 4: Encode categorical features (exclude label column)
-   # text_feature_cols = df.select_dtypes(
-   #     include=["object", "string", "category"]
-   # ).columns
-   # text_feature_cols = [
-   #     col for col in text_feature_cols if col != label_col
-   # ] # ✅ exclude label
-
-   # df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
-   # m_cleaned, n_cleaned = df.shape
-
-   # num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
-   # scaler = StandardScaler()
-   # df[num_cols] = scaler.fit_transform(df[num_cols])
-
-   # if export_csv:
-   #     _export_csv(df, data_name)
-
-   # # print info
-   # if print_info:
-   #     pos_count = (df[label_col] == 1).sum()
-   #     neg_count = (df[label_col] == -1).sum()
-
-   #     # Step 6: Print dataset information
-   #     print("\n" + "=" * 80)
-   #     print(f"{f'{data_name} - Info':^70}")
-   #     print("=" * 80)
-   #     print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
-   #     print(
-   #         f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
-   #     )
-   #     print(f"{'Positive samples (+1):':<40} {pos_count}")
-   #     print(f"{'Negative samples (-1):':<40} {neg_count}")
-   #     print(
-   #         f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
-   #     )
-   #     print("-" * 80)
-   #     print(f"Note:")
-   #     print(f"{'Label column:':<40} {label_col}")
-   #     print(f"{'label_map:':<40} {label_map}")
-   #     print(
-   #         f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
-   #     )
-   #     print(
-   #         f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
-   #     )
-   #     print("=" * 80 + "\n")
-
-   # return df
-
-
-# def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
-#     csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
-#     drop_cols = ['user_id']
-#     label_col = 'subscribed_after'
-#     label_map = {0: -1, 1: 1}
-
-#     if not os.path.exists(csv_path):
-#         print('\n' + '*'*60)
-#         print(f"Please download the data.")
-#         print(csv_path)
-#         _download_data(data_name)
-#         # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
-#     # Step 0: Load the dataset
-#     df = pd.read_csv(csv_path)
-
-#     df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
-#     df = df.dropna(subset=['timestamp'])
-
-#     df["hour"] = df['timestamp'].dt.hour
-#     df["dayofweek"] = df['timestamp'].dt.dayofweek
-#     df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
-
-#     user_one_hot_cols = ['dayofweek','is_weekend','hour']
-#     drop_cols = ['user_id', 'timestamp']
-
-#     # Save original size
-#     m_original, n_original = df.shape
-
-#     # Step 1: Drop non-informative columns
-#     df = df.drop(columns=drop_cols)
-
-#     # Step 2: Remove rows with missing values
-#     df = df.dropna(axis=0, how="any")
-#     m_encoded, n_encoded = df.shape
-
-#     # Step 3: Map target label (to -1 and +1)
-#     df[label_col] = df[label_col].map(label_map)
-
-#     # Step 4: Encode categorical features (exclude label column)
-#     text_feature_cols = df.select_dtypes(
-#         include=["object", "string", "category"]
-#     ).columns
-#     text_feature_cols = [
-#         col for col in text_feature_cols if col != label_col
-#     ] # ✅ exclude label
-
-#     df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
-#     m_cleaned, n_cleaned = df.shape
-
-#     num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
-#     scaler = StandardScaler()
-#     df[num_cols] = scaler.fit_transform(df[num_cols])
-
-#     if export_csv:
-#         _export_csv(df, data_name)
-
-#     # print info
-#     if print_info:
-#         pos_count = (df[label_col] == 1).sum()
-#         neg_count = (df[label_col] == -1).sum()
-
-#         # Step 6: Print dataset information
-#         print("\n" + "=" * 80)
-#         print(f"{f'{data_name} - Info':^70}")
-#         print("=" * 80)
-#         print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
-#         print(
-#             f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
-#         )
-#         print(f"{'Positive samples (+1):':<40} {pos_count}")
-#         print(f"{'Negative samples (-1):':<40} {neg_count}")
-#         print(
-#             f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
-#         )
-#         print("-" * 80)
-#         print(f"Note:")
-#         print(f"{'Label column:':<40} {label_col}")
-#         print(f"{'label_map:':<40} {label_map}")
-#         print(
-#             f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
-#         )
-#         print(
-#             f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
-#         )
-#         print("=" * 80 + "\n")
-
-#     return df
-
+   return df

def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
-   csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
+
+   data_type = "binary"
+   csv_path = f'./exp_data/{data_type}/{data_name}/youtube recommendation dataset.csv'
    drop_cols = ['user_id']
    label_col = 'subscribed_after'
    label_map = {0: -1, 1: 1}
@@ -379,7 +229,7 @@ def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = Fa
        'time_col_name': 'timestamp',
        'trans_type': 1
    }
-
-   df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
+
+   df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)

    return df
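Taken together, the DataSets changes route downloads through `./exp_data/<data_type>/<data_name>/` and expose `export_csv` on every loader. A typical call, assuming the data file is already in place (otherwise the interactive Firefox/Chrome prompt in `_download_data` appears first):

```python
import junshan_kit.DataSets as DataSets

# Downloads (if needed), cleans, one-hot encodes, and standardizes the data;
# export_csv=True would also write '<data_name>_num.csv' under ./exp_data/.
df = DataSets.adult_income_prediction(print_info=True, export_csv=False)
print(df.shape)
```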