junshan-kit 2.3.9__py2.py3-none-any.whl → 2.4.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
+ import torch, os, time
+ from torch.optim.optimizer import Optimizer
+ from torch.nn.utils import parameters_to_vector, vector_to_parameters
+ import junshan_kit.SPBM_func as SPBM_func
+
+ class SPSmax(Optimizer):
+     def __init__(self, params, model, hyperparams, Paras):
+         defaults = dict()
+         super().__init__(params, defaults)
+         self.model = model
+         self.c = hyperparams['c']
+         self.gamma = hyperparams['gamma']
+         if 'f_star' not in Paras or Paras['f_star'] is None:
+             self.f_star = 0
+         else:
+             self.f_star = Paras['f_star']
+         self.step_size = []
+
+     def step(self, closure=None):
+         if closure is None:
+             raise RuntimeError("Closure required for SPSmax")
+
+         # Reset the gradient and perform forward computation
+         loss = closure()
+
+         with torch.no_grad():
+             xk = parameters_to_vector(self.model.parameters())
+             # print(torch.norm(xk))
+             g_k = parameters_to_vector([p.grad if p.grad is not None else torch.zeros_like(p) for p in self.model.parameters()])
+
+             # Step-size
+             step_size = (loss - self.f_star) / ((self.c * torch.norm(g_k, p=2) ** 2) + 1e-8)
+             step_size = min(step_size, self.gamma)
+             self.step_size.append(step_size)
+
+             # Update
+             xk = xk - step_size * g_k
+
+             # print(len(self.f_his))
+             vector_to_parameters(xk, self.model.parameters())
+
+         # Temporarily return the loss (tensor type)
+         return loss
+
+
+ class ALR_SMAG(Optimizer):
+     def __init__(self, params, model, hyperparams, Paras):
+         defaults = dict()
+         super().__init__(params, defaults)
+         self.model = model
+         self.c = hyperparams['c']
+         self.eta_max = hyperparams['eta_max']
+         self.beta = hyperparams['beta']
+         if 'f_star' not in Paras or Paras['f_star'] is None:
+             self.f_star = 0
+         else:
+             self.f_star = Paras['f_star']
+         self.step_size = []
+         self.d_k = torch.zeros_like(parameters_to_vector(self.model.parameters()))
+
+     def step(self, closure=None):
+         if closure is None:
+             raise RuntimeError("Closure required for ALR_SMAG")
+
+         # Reset the gradient and perform forward computation
+         loss = closure()
+
+         with torch.no_grad():
+             xk = parameters_to_vector(self.model.parameters())
+             # print(torch.norm(xk))
+             g_k = parameters_to_vector([p.grad if p.grad is not None else torch.zeros_like(p) for p in self.model.parameters()])
+
+             self.d_k = self.beta * self.d_k + g_k
+             # Step-size
+             step_size = (loss - self.f_star) / ((self.c * torch.norm(self.d_k, p=2) ** 2) + 1e-8)
+             step_size = min(step_size, self.eta_max)
+             self.step_size.append(step_size)
+
+             # Update
+             xk = xk - step_size * g_k
+
+             # print(len(self.f_his))
+             vector_to_parameters(xk, self.model.parameters())
+
+         # Temporarily return the loss (tensor type)
+         return loss
+
+ # ------------ Bundle Method --------------------
+ class Bundle(Optimizer):
+     def __init__(self, params, model, hyperparams, Paras):
+         defaults = dict()
+         super().__init__(params, defaults)
+         self.model = model
+         self.cutting_num = hyperparams['cutting_number']
+         self.delta = hyperparams['delta']
+         self.Paras = Paras
+
+         self.x_his, self.g_his, self.f_his = [], [], []
+
+     def step(self, closure=None):
+         if closure is None:
+             raise RuntimeError("Closure required for CuttingPlaneOptimizer")
+
+         # Reset the gradient and perform forward computation
+         loss = closure()
+
+         with torch.no_grad():
+             xk = parameters_to_vector(self.model.parameters())
+             # print(torch.norm(xk))
+             g_k = parameters_to_vector([p.grad if p.grad is not None else torch.zeros_like(p) for p in self.model.parameters()])
+
+             # Add cutting plane
+             x_his, f_his, g_his = SPBM_func.add_cutting(self.x_his, self.f_his, self.g_his, xk.detach().clone(), g_k.detach().clone(), loss.detach().clone(), self.cutting_num)
+
+             # The coefficients of the dual problem
+             Gk, rk, ek = SPBM_func.get_var(x_his, f_his, g_his, self.delta)
+
+             # Solve the dual problem
+             xk = SPBM_func.bundle(Gk, ek, xk, self.delta, self.Paras)
+
+             # print(len(self.f_his))
+             vector_to_parameters(xk, self.model.parameters())
+
+         # Return the loss (tensor type)
+         return loss
+
@@ -32,7 +32,6 @@ class CSV_TO_Pandas:
  - time_col_name : str
  Name of the column containing time or datetime values.
  - trans_type : int, optional, default=1
- Extraction mode.
  - 0 : Extract ['year', 'month', 'day', 'hour']
  - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
 
@@ -151,9 +150,6 @@ class CSV_TO_Pandas:
  # Save original size
  m_original, n_original = df.shape
 
- if time_info is not None:
- df = self._trans_time_fea(df, time_info)
-
  # Step 1: Drop non-informative columns
  df = df.drop(columns=drop_cols)
 
@@ -161,6 +157,9 @@ class CSV_TO_Pandas:
  df = df.dropna(axis=0, how="any")
  m_encoded, n_encoded = df.shape
 
+ if time_info is not None:
+ df = self._trans_time_fea(df, time_info)
+
  # Step 3: Map target label (to -1 and +1)
  df[label_col] = df[label_col].map(label_map)
 
@@ -195,11 +194,14 @@ class CSV_TO_Pandas:
 
  # Step 6: Print dataset information
  print("\n" + "=" * 80)
- print(f"{f'{title_name} - Info':^70}")
+ print(f"{f'{title_name} - Summary':^70}")
  print("=" * 80)
  print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
  print(
- f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+ f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+ )
+ print(
+ f"{'Dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
  )
  print(f"{'Positive samples (+1):':<40} {pos_count}")
  print(f"{'Negative samples (-1):':<40} {neg_count}")
@@ -207,13 +209,15 @@ class CSV_TO_Pandas:
  f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
  )
  print("-" * 80)
- print(f"Note:")
+ print(f"{'More details about preprocessing':^70}")
+ print("-" * 80)
  print(f"{'Label column:':<40} {label_col}")
  print(f"{'label_map:':<40} {label_map}")
  print(f"{'time column:':<40} {time_info}")
- print(
- f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
- )
+ if time_info:
+ print(f"{'trans_type : int, optional, default=1'}")
+ print(f"{' - 0 : Extract [\'year\', \'month\', \'day\', \'hour\']':<10}")
+ print(f"{' - 1 : Extract [\'hour\', \'dayofweek\', \'is_weekend\']':<10}")
  print(
  f"{'text feature columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
  )
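The net effect of these hunks: the "Extraction mode." line is dropped from the docstring, the time-feature expansion (_trans_time_fea) now runs after rows with missing values are removed rather than before, and the printed summary now lists the dropped columns and, when time_info is given, the two trans_type extraction modes. The body of _trans_time_fea is not part of this diff; the sketch below is an assumption of what the two documented modes do, mirroring the commented-out code that is deleted from DataSets.py further down.

```python
import pandas as pd

def _trans_time_fea_sketch(df: pd.DataFrame, time_info: dict) -> pd.DataFrame:
    """Hypothetical stand-in for CSV_TO_Pandas._trans_time_fea (not the real body)."""
    col = time_info['time_col_name']
    t = pd.to_datetime(df[col], errors='coerce')
    if time_info.get('trans_type', 1) == 0:
        # trans_type = 0: extract ['year', 'month', 'day', 'hour']
        df['year'], df['month'] = t.dt.year, t.dt.month
        df['day'], df['hour'] = t.dt.day, t.dt.hour
    else:
        # trans_type = 1: extract ['hour', 'dayofweek', 'is_weekend']
        df['hour'] = t.dt.hour
        df['dayofweek'] = t.dt.dayofweek
        df['is_weekend'] = t.dt.dayofweek.isin([5, 6]).astype(int)
    return df.drop(columns=[col])
```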
junshan_kit/DataSets.py CHANGED
@@ -12,7 +12,10 @@ import junshan_kit.kit
  from sklearn.preprocessing import StandardScaler
 
  #----------------------------------------------------------
- def _download_data(data_name):
+ def _download_data(data_name, data_type):
+ allowed_types = ["binary", "multi"]
+ if data_type not in allowed_types:
+ raise ValueError(f"Invalid data_type: {data_type!r}. Must be one of {allowed_types}.")
  from junshan_kit.kit import JianguoyunDownloaderFirefox, JianguoyunDownloaderChrome
 
  # User selects download method
@@ -26,32 +29,32 @@ def _download_data(data_name):
  choice = input("Enter the number of your choice (1 or 2): ").strip()
 
  if choice == "1":
- JianguoyunDownloaderFirefox(url, f"./exp_data/{data_name}").run()
+ JianguoyunDownloaderFirefox(url, f"./exp_data/{data_type}/{data_name}").run()
  print("✅ Download completed using Firefox")
  break
  elif choice == "2":
- JianguoyunDownloaderChrome(url, f"./exp_data/{data_name}").run()
+ JianguoyunDownloaderChrome(url, f"./exp_data/{data_type}/{data_name}").run()
  print("✅ Download completed using Chrome")
  break
  else:
  print("❌ Invalid choice. Please enter 1 or 2.\n")
 
  # unzip file
- junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+ junshan_kit.kit.unzip_file(f'./exp_data/{data_type}/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
 
  def _export_csv(df, data_name):
- path = f'./data_trans_fea/{data_name}/'
+ path = f'./exp_data/{data_name}/'
  os.makedirs(path, exist_ok=True)
- df.to_csv(path + f'{data_name}.csv')
+ df.to_csv(path + f'{data_name}_num.csv')
  print(path + f'{data_name}.csv')
 
 
- def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None):
+ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None):
  if not os.path.exists(csv_path):
  print('\n' + '*'*60)
  print(f"Please download the data.")
  print(csv_path)
- _download_data(data_name)
+ _download_data(data_name, data_type=data_type)
  # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
 
  cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
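In short, 2.4.1 threads a data_type through the download and preprocessing helpers: downloads and CSVs now live under ./exp_data/{data_type}/{data_name}/, _download_data fails fast on anything other than "binary" or "multi", and _export_csv writes the processed frame as {data_name}_num.csv under ./exp_data/{data_name}/. A small illustration of the validation, calling the private helper directly purely for demonstration (every loader in this release passes data_type="binary"):

```python
# Expected on-disk layout after the change (paths taken from the diff above):
#   ./exp_data/binary/Adult Income Prediction/adult.csv
#   ./exp_data/binary/Credit Card Fraud Detection/creditcard.csv

try:
    _download_data("Adult Income Prediction", data_type="regression")
except ValueError as err:
    # Invalid data_type: 'regression'. Must be one of ['binary', 'multi'].
    print(err)
```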
@@ -62,113 +65,139 @@ def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_
 
  return df
 
+
+ # ********************************************************************
  """
  ----------------------------------------------------------------------
  Datasets
  ----------------------------------------------------------------------
  """
 
- def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False):
+ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False):
 
- csv_path = f'./exp_data/{data_name}/creditcard.csv'
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/creditcard.csv'
  drop_cols = []
  label_col = 'Class'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
+
 
  return df
 
 
- def diabetes_health_indicators_dataset(data_name = "Diabetes Health Indicators", print_info = False):
- csv_path = f'./exp_data/{data_name}/diabetes_dataset.csv'
+ def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False):
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/diabetes_dataset.csv'
  drop_cols = []
  label_col = 'diagnosed_diabetes'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
- def electric_vehicle_population_data(data_name = "Electric Vehicle Population", print_info = False):
- csv_path = f'./exp_data/{data_name}/Electric_Vehicle_Population_Data.csv'
+ def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/Electric_Vehicle_Population_Data.csv'
  drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
  label_col = 'Electric Vehicle Type'
  label_map = {
  'Battery Electric Vehicle (BEV)': 1,
  'Plug-in Hybrid Electric Vehicle (PHEV)': -1
  }
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
- def global_house_purchase_dataset(data_name = "Global House Purchase", print_info = False):
- csv_path = f'./exp_data/{data_name}/global_house_purchase_dataset.csv'
+ def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/global_house_purchase_dataset.csv'
  drop_cols = ['property_id']
  label_col = 'decision'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
- def health_lifestyle_dataset(data_name = "Health Lifestyle", print_info = False):
- csv_path = f'./exp_data/{data_name}/health_lifestyle_dataset.csv'
+ def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/health_lifestyle_dataset.csv'
  drop_cols = ['id']
  label_col = 'disease_risk'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
-
- def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False):
+ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False, export_csv = False):
  """
  1. The missing values in this dataset are handled by directly removing the corresponding column. Since the `alcohol_freq` column contains a large number of missing values, deleting the rows would result in significant data loss, so the entire column is dropped instead.
 
  2. There are several columns that could serve as binary classification labels, such as `is_high_risk`, `cardiovascular_disease`, and `liver_disease`. In this case, `is_high_risk` is chosen as the label column.
  """
- csv_path = f'./exp_data/{data_name}/medical_insurance.csv'
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/medical_insurance.csv'
  drop_cols = ['alcohol_freq']
  label_col = 'is_high_risk'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
- def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False):
- csv_path = f'./exp_data/{data_name}/Particle Physics Event Classification.csv'
+ def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/Particle Physics Event Classification.csv'
  drop_cols = []
  label_col = 'Label'
  label_map = {'s': -1, 'b': 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
 
- def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False):
- csv_path = f'./exp_data/{data_name}/adult.csv'
+ def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/adult.csv'
  drop_cols = []
  label_col = 'income'
  label_map = {'<=50K': -1, '>50K': 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
  def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
- csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
- drop_cols = []
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/TNweather_1.8M.csv'
+ drop_cols = ['Unnamed: 0']
  label_col = 'rain_tomorrow'
  label_map = {0: -1, 1: 1}
 
@@ -180,194 +209,15 @@ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info =
  'trans_type': 0
  }
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, time_info = time_info)
-
- return df
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
 
 
- # def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
- # csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
- # label_col = 'rain_tomorrow'
- # label_map = {0: -1, 1: 1}
-
- # if not os.path.exists(csv_path):
- # print('\n' + '*'*60)
- # print(f"Please download the data.")
- # print(csv_path)
- # _download_data(data_name)
- # # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
- # # Step 0: Load the dataset
- # df = pd.read_csv(csv_path)
-
- # df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
- # df = df.dropna(subset=['timestamp'])
-
- # df['time'] = pd.to_datetime(df['time'])
- # df['year'] = df['time'].dt.year
- # df['month'] = df['time'].dt.month
- # df['day'] = df['time'].dt.day
- # df['hour'] = df['time'].dt.hour
-
- # user_one_hot_cols = ['year','month','day', 'hour']
- # drop_cols = ['Unnamed: 0', 'time']
-
- # # Save original size
- # m_original, n_original = df.shape
-
- # # Step 1: Drop non-informative columns
- # df = df.drop(columns=drop_cols)
-
- # # Step 2: Remove rows with missing values
- # df = df.dropna(axis=0, how="any")
- # m_encoded, n_encoded = df.shape
-
- # # Step 3: Map target label (to -1 and +1)
- # df[label_col] = df[label_col].map(label_map)
-
- # # Step 4: Encode categorical features (exclude label column)
- # text_feature_cols = df.select_dtypes(
- # include=["object", "string", "category"]
- # ).columns
- # text_feature_cols = [
- # col for col in text_feature_cols if col != label_col
- # ] # ✅ exclude label
-
- # df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
- # m_cleaned, n_cleaned = df.shape
-
- # num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
- # scaler = StandardScaler()
- # df[num_cols] = scaler.fit_transform(df[num_cols])
-
- # if export_csv:
- # _export_csv(df, data_name)
-
- # # print info
- # if print_info:
- # pos_count = (df[label_col] == 1).sum()
- # neg_count = (df[label_col] == -1).sum()
-
- # # Step 6: Print dataset information
- # print("\n" + "=" * 80)
- # print(f"{f'{data_name} - Info':^70}")
- # print("=" * 80)
- # print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
- # print(
- # f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
- # )
- # print(f"{'Positive samples (+1):':<40} {pos_count}")
- # print(f"{'Negative samples (-1):':<40} {neg_count}")
- # print(
- # f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
- # )
- # print("-" * 80)
- # print(f"Note:")
- # print(f"{'Label column:':<40} {label_col}")
- # print(f"{'label_map:':<40} {label_map}")
- # print(
- # f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
- # )
- # print(
- # f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
- # )
- # print("=" * 80 + "\n")
-
- # return df
-
-
- # def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
- # csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
- # drop_cols = ['user_id']
- # label_col = 'subscribed_after'
- # label_map = {0: -1, 1: 1}
-
- # if not os.path.exists(csv_path):
- # print('\n' + '*'*60)
- # print(f"Please download the data.")
- # print(csv_path)
- # _download_data(data_name)
- # # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
- # # Step 0: Load the dataset
- # df = pd.read_csv(csv_path)
-
- # df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
- # df = df.dropna(subset=['timestamp'])
-
- # df["hour"] = df['timestamp'].dt.hour
- # df["dayofweek"] = df['timestamp'].dt.dayofweek
- # df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
-
- # user_one_hot_cols = ['dayofweek','is_weekend','hour']
- # drop_cols = ['user_id', 'timestamp']
-
- # # Save original size
- # m_original, n_original = df.shape
-
- # # Step 1: Drop non-informative columns
- # df = df.drop(columns=drop_cols)
-
- # # Step 2: Remove rows with missing values
- # df = df.dropna(axis=0, how="any")
- # m_encoded, n_encoded = df.shape
-
- # # Step 3: Map target label (to -1 and +1)
- # df[label_col] = df[label_col].map(label_map)
-
- # # Step 4: Encode categorical features (exclude label column)
- # text_feature_cols = df.select_dtypes(
- # include=["object", "string", "category"]
- # ).columns
- # text_feature_cols = [
- # col for col in text_feature_cols if col != label_col
- # ] # ✅ exclude label
-
- # df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
- # m_cleaned, n_cleaned = df.shape
-
- # num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
- # scaler = StandardScaler()
- # df[num_cols] = scaler.fit_transform(df[num_cols])
-
- # if export_csv:
- # _export_csv(df, data_name)
-
- # # print info
- # if print_info:
- # pos_count = (df[label_col] == 1).sum()
- # neg_count = (df[label_col] == -1).sum()
-
- # # Step 6: Print dataset information
- # print("\n" + "=" * 80)
- # print(f"{f'{data_name} - Info':^70}")
- # print("=" * 80)
- # print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
- # print(
- # f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
- # )
- # print(f"{'Positive samples (+1):':<40} {pos_count}")
- # print(f"{'Negative samples (-1):':<40} {neg_count}")
- # print(
- # f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
- # )
- # print("-" * 80)
- # print(f"Note:")
- # print(f"{'Label column:':<40} {label_col}")
- # print(f"{'label_map:':<40} {label_map}")
- # print(
- # f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
- # )
- # print(
- # f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
- # )
- # print("=" * 80 + "\n")
-
- # return df
-
+ return df
 
  def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
- csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/youtube recommendation dataset.csv'
  drop_cols = ['user_id']
  label_col = 'subscribed_after'
  label_map = {0: -1, 1: 1}
@@ -379,7 +229,7 @@ def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = Fa
  'time_col_name': 'timestamp',
  'trans_type': 1
  }
-
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, time_info = time_info)
+
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
 
  return df
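Taken together, every public loader in DataSets.py now accepts export_csv and forwards data_type to _run. A minimal usage sketch; the call itself is illustrative, while the argument names, defaults, and paths come from the diff above:

```python
import junshan_kit.DataSets as DataSets

# Downloads to ./exp_data/binary/YouTube Recommendation/ if the CSV is missing,
# expands the 'timestamp' column with trans_type = 1 ('hour', 'dayofweek', 'is_weekend'),
# and, because export_csv=True, writes the processed frame to
# ./exp_data/YouTube Recommendation/YouTube Recommendation_num.csv
df = DataSets.YouTube_Recommendation(print_info=True, export_csv=True)
print(df.shape)
```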