junshan-kit 2.3.9__py2.py3-none-any.whl → 2.4.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
+ import torch, os, time
+ from torch.optim.optimizer import Optimizer
+ from torch.nn.utils import parameters_to_vector, vector_to_parameters
+ import junshan_kit.SPBM_func as SPBM_func
+
+ class SPSmax(Optimizer):
+     def __init__(self, params, model, hyperparams, Paras):
+         defaults = dict()
+         super().__init__(params, defaults)
+         self.model = model
+         self.c = hyperparams['c']
+         self.gamma = hyperparams['gamma']
+         if 'f_star' not in Paras or Paras['f_star'] is None:
+             self.f_star = 0
+         else:
+             self.f_star = Paras['f_star']
+         self.step_size = []
+
+     def step(self, closure=None):
+         if closure is None:
+             raise RuntimeError("Closure required for SPSmax")
+
+         # Reset the gradient and perform forward computation
+         loss = closure()
+
+         with torch.no_grad():
+             xk = parameters_to_vector(self.model.parameters())
+             # print(torch.norm(xk))
+             g_k = parameters_to_vector([p.grad if p.grad is not None else torch.zeros_like(p) for p in self.model.parameters()])
+
+             # Step-size
+             step_size = (loss - self.f_star) / ((self.c * torch.norm(g_k, p=2) ** 2) + 1e-8)
+             step_size = min(step_size, self.gamma)
+             self.step_size.append(step_size)
+
+             # Update
+             xk = xk - step_size * g_k
+
+             # print(len(self.f_his))
+             vector_to_parameters(xk, self.model.parameters())
+
+         # Temporarily return the loss (tensor type)
+         return loss
+
+
+ class ALR_SMAG(Optimizer):
+     def __init__(self, params, model, hyperparams, Paras):
+         defaults = dict()
+         super().__init__(params, defaults)
+         self.model = model
+         self.c = hyperparams['c']
+         self.eta_max = hyperparams['eta_max']
+         self.beta = hyperparams['beta']
+         if 'f_star' not in Paras or Paras['f_star'] is None:
+             self.f_star = 0
+         else:
+             self.f_star = Paras['f_star']
+         self.step_size = []
+         self.d_k = torch.zeros_like(parameters_to_vector(self.model.parameters()))
+
+     def step(self, closure=None):
+         if closure is None:
+             raise RuntimeError("Closure required for ALR_SMAG")
+
+         # Reset the gradient and perform forward computation
+         loss = closure()
+
+         with torch.no_grad():
+             xk = parameters_to_vector(self.model.parameters())
+             # print(torch.norm(xk))
+             g_k = parameters_to_vector([p.grad if p.grad is not None else torch.zeros_like(p) for p in self.model.parameters()])
+
+             self.d_k = self.beta * self.d_k + g_k
+             # Step-size
+             step_size = (loss - self.f_star) / ((self.c * torch.norm(self.d_k, p=2) ** 2) + 1e-8)
+             step_size = min(step_size, self.eta_max)
+             self.step_size.append(step_size)
+
+             # Update
+             xk = xk - step_size * g_k
+
+             # print(len(self.f_his))
+             vector_to_parameters(xk, self.model.parameters())
+
+         # Temporarily return the loss (tensor type)
+         return loss
+
+ # ------------ Bundle Method --------------------
+ class Bundle(Optimizer):
+     def __init__(self, params, model, hyperparams, Paras):
+         defaults = dict()
+         super().__init__(params, defaults)
+         self.model = model
+         self.cutting_num = hyperparams['cutting_number']
+         self.delta = hyperparams['delta']
+         self.Paras = Paras
+
+         self.x_his, self.g_his, self.f_his = [], [], []
+
+     def step(self, closure=None):
+         if closure is None:
+             raise RuntimeError("Closure required for CuttingPlaneOptimizer")
+
+         # Reset the gradient and perform forward computation
+         loss = closure()
+
+         with torch.no_grad():
+             xk = parameters_to_vector(self.model.parameters())
+             # print(torch.norm(xk))
+             g_k = parameters_to_vector([p.grad if p.grad is not None else torch.zeros_like(p) for p in self.model.parameters()])
+
+             # Add cutting plane
+             x_his, f_his, g_his = SPBM_func.add_cutting(self.x_his, self.f_his, self.g_his, xk.detach().clone(), g_k.detach().clone(), loss.detach().clone(), self.cutting_num)
+
+             # The coefficients of the dual problem
+             Gk, rk, ek = SPBM_func.get_var(x_his, f_his, g_his, self.delta)
+
+             # Solve the dual problem
+             xk = SPBM_func.bundle(Gk, ek, xk, self.delta, self.Paras)
+
+             # print(len(self.f_his))
+             vector_to_parameters(xk, self.model.parameters())
+
+         # Return the loss (tensor type)
+         return loss
+
@@ -32,7 +32,6 @@ class CSV_TO_Pandas:
  - time_col_name : str
  Name of the column containing time or datetime values.
  - trans_type : int, optional, default=1
- Extraction mode.
  - 0 : Extract ['year', 'month', 'day', 'hour']
  - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
 
@@ -151,9 +150,6 @@ class CSV_TO_Pandas:
  # Save original size
  m_original, n_original = df.shape
 
- if time_info is not None:
- df = self._trans_time_fea(df, time_info)
-
  # Step 1: Drop non-informative columns
  df = df.drop(columns=drop_cols)
 
@@ -161,6 +157,9 @@ class CSV_TO_Pandas:
  df = df.dropna(axis=0, how="any")
  m_encoded, n_encoded = df.shape
 
+ if time_info is not None:
+ df = self._trans_time_fea(df, time_info)
+
  # Step 3: Map target label (to -1 and +1)
  df[label_col] = df[label_col].map(label_map)
 
@@ -195,11 +194,14 @@ class CSV_TO_Pandas:
 
  # Step 6: Print dataset information
  print("\n" + "=" * 80)
- print(f"{f'{title_name} - Info':^70}")
+ print(f"{f'{title_name} - Summary':^70}")
  print("=" * 80)
  print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
  print(
- f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+ f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+ )
+ print(
+ f"{'Dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
  )
  print(f"{'Positive samples (+1):':<40} {pos_count}")
  print(f"{'Negative samples (-1):':<40} {neg_count}")
@@ -207,13 +209,15 @@ class CSV_TO_Pandas:
  f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
  )
  print("-" * 80)
- print(f"Note:")
+ print(f"{'More details about preprocessing':^70}")
+ print("-" * 80)
  print(f"{'Label column:':<40} {label_col}")
  print(f"{'label_map:':<40} {label_map}")
  print(f"{'time column:':<40} {time_info}")
- print(
- f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
- )
+ if time_info:
+ print(f"{'trans_type : int, optional, default=1'}")
+ print(f"{' - 0 : Extract [\'year\', \'month\', \'day\', \'hour\']':<10}")
+ print(f"{' - 1 : Extract [\'hour\', \'dayofweek\', \'is_weekend\']':<10}")
  print(
  f"{'text feature columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
  )
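The net effect of these hunks: the "Extraction mode." line is dropped from the docstring, the time-feature expansion (_trans_time_fea) now runs after rows with missing values are removed rather than before, and the printed summary now lists the dropped columns and, when time_info is given, the two trans_type extraction modes. The body of _trans_time_fea is not part of this diff; the sketch below is an assumption of what the two documented modes do, mirroring the commented-out code that is deleted from DataSets.py further down.

```python
import pandas as pd

def _trans_time_fea_sketch(df: pd.DataFrame, time_info: dict) -> pd.DataFrame:
    """Hypothetical stand-in for CSV_TO_Pandas._trans_time_fea (not the real body)."""
    col = time_info['time_col_name']
    t = pd.to_datetime(df[col], errors='coerce')
    if time_info.get('trans_type', 1) == 0:
        # trans_type = 0: extract ['year', 'month', 'day', 'hour']
        df['year'], df['month'] = t.dt.year, t.dt.month
        df['day'], df['hour'] = t.dt.day, t.dt.hour
    else:
        # trans_type = 1: extract ['hour', 'dayofweek', 'is_weekend']
        df['hour'] = t.dt.hour
        df['dayofweek'] = t.dt.dayofweek
        df['is_weekend'] = t.dt.dayofweek.isin([5, 6]).astype(int)
    return df.drop(columns=[col])
```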
junshan_kit/DataSets.py CHANGED
@@ -12,7 +12,10 @@ import junshan_kit.kit
  from sklearn.preprocessing import StandardScaler
 
  #----------------------------------------------------------
- def _download_data(data_name):
+ def _download_data(data_name, data_type):
+ allowed_types = ["binary", "multi"]
+ if data_type not in allowed_types:
+ raise ValueError(f"Invalid data_type: {data_type!r}. Must be one of {allowed_types}.")
  from junshan_kit.kit import JianguoyunDownloaderFirefox, JianguoyunDownloaderChrome
 
  # User selects download method
@@ -26,32 +29,32 @@ def _download_data(data_name):
  choice = input("Enter the number of your choice (1 or 2): ").strip()
 
  if choice == "1":
- JianguoyunDownloaderFirefox(url, f"./exp_data/{data_name}").run()
+ JianguoyunDownloaderFirefox(url, f"./exp_data/{data_type}/{data_name}").run()
  print("✅ Download completed using Firefox")
  break
  elif choice == "2":
- JianguoyunDownloaderChrome(url, f"./exp_data/{data_name}").run()
+ JianguoyunDownloaderChrome(url, f"./exp_data/{data_type}/{data_name}").run()
  print("✅ Download completed using Chrome")
  break
  else:
  print("❌ Invalid choice. Please enter 1 or 2.\n")
 
  # unzip file
- junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+ junshan_kit.kit.unzip_file(f'./exp_data/{data_type}/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
 
  def _export_csv(df, data_name):
- path = f'./data_trans_fea/{data_name}/'
+ path = f'./exp_data/{data_name}/'
  os.makedirs(path, exist_ok=True)
- df.to_csv(path + f'{data_name}.csv')
+ df.to_csv(path + f'{data_name}_num.csv')
  print(path + f'{data_name}.csv')
 
 
- def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None):
+ def _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None):
  if not os.path.exists(csv_path):
  print('\n' + '*'*60)
  print(f"Please download the data.")
  print(csv_path)
- _download_data(data_name)
+ _download_data(data_name, data_type=data_type)
  # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
 
  cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
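In short, 2.4.1 threads a data_type through the download and preprocessing helpers: downloads and CSVs now live under ./exp_data/{data_type}/{data_name}/, _download_data fails fast on anything other than "binary" or "multi", and _export_csv writes the processed frame as {data_name}_num.csv under ./exp_data/{data_name}/. A small illustration of the validation, calling the private helper directly purely for demonstration (every loader in this release passes data_type="binary"):

```python
# Expected on-disk layout after the change (paths taken from the diff above):
#   ./exp_data/binary/Adult Income Prediction/adult.csv
#   ./exp_data/binary/Credit Card Fraud Detection/creditcard.csv

try:
    _download_data("Adult Income Prediction", data_type="regression")
except ValueError as err:
    # Invalid data_type: 'regression'. Must be one of ['binary', 'multi'].
    print(err)
```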
@@ -62,113 +65,139 @@ def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_
 
  return df
 
+
+ # ********************************************************************
  """
  ----------------------------------------------------------------------
  Datasets
  ----------------------------------------------------------------------
  """
 
- def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False):
+ def credit_card_fraud_detection(data_name = "Credit Card Fraud Detection", print_info = False, export_csv=False):
 
- csv_path = f'./exp_data/{data_name}/creditcard.csv'
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/creditcard.csv'
  drop_cols = []
  label_col = 'Class'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
+
 
  return df
 
 
- def diabetes_health_indicators_dataset(data_name = "Diabetes Health Indicators", print_info = False):
- csv_path = f'./exp_data/{data_name}/diabetes_dataset.csv'
+ def diabetes_health_indicators(data_name = "Diabetes Health Indicators", print_info = False, export_csv = False):
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/diabetes_dataset.csv'
  drop_cols = []
  label_col = 'diagnosed_diabetes'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
- def electric_vehicle_population_data(data_name = "Electric Vehicle Population", print_info = False):
- csv_path = f'./exp_data/{data_name}/Electric_Vehicle_Population_Data.csv'
+ def electric_vehicle_population(data_name = "Electric Vehicle Population", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/Electric_Vehicle_Population_Data.csv'
  drop_cols = ['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location']
  label_col = 'Electric Vehicle Type'
  label_map = {
  'Battery Electric Vehicle (BEV)': 1,
  'Plug-in Hybrid Electric Vehicle (PHEV)': -1
  }
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
- def global_house_purchase_dataset(data_name = "Global House Purchase", print_info = False):
- csv_path = f'./exp_data/{data_name}/global_house_purchase_dataset.csv'
+ def global_house_purchase(data_name = "Global House Purchase", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/global_house_purchase_dataset.csv'
  drop_cols = ['property_id']
  label_col = 'decision'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
- def health_lifestyle_dataset(data_name = "Health Lifestyle", print_info = False):
- csv_path = f'./exp_data/{data_name}/health_lifestyle_dataset.csv'
+ def health_lifestyle(data_name = "Health Lifestyle", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/health_lifestyle_dataset.csv'
  drop_cols = ['id']
  label_col = 'disease_risk'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
-
- def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False):
+ def medical_insurance_cost_prediction(data_name = "Medical Insurance Cost Prediction", print_info = False, export_csv = False):
  """
  1. The missing values in this dataset are handled by directly removing the corresponding column. Since the `alcohol_freq` column contains a large number of missing values, deleting the rows would result in significant data loss, so the entire column is dropped instead.
 
  2. There are several columns that could serve as binary classification labels, such as `is_high_risk`, `cardiovascular_disease`, and `liver_disease`. In this case, `is_high_risk` is chosen as the label column.
  """
- csv_path = f'./exp_data/{data_name}/medical_insurance.csv'
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/medical_insurance.csv'
  drop_cols = ['alcohol_freq']
  label_col = 'is_high_risk'
  label_map = {0: -1, 1: 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
- def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False):
- csv_path = f'./exp_data/{data_name}/Particle Physics Event Classification.csv'
+ def particle_physics_event_classification(data_name = "Particle Physics Event Classification", print_info = False, export_csv = False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/Particle Physics Event Classification.csv'
  drop_cols = []
  label_col = 'Label'
  label_map = {'s': -1, 'b': 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
 
- def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False):
- csv_path = f'./exp_data/{data_name}/adult.csv'
+ def adult_income_prediction(data_name = "Adult Income Prediction", print_info = False, export_csv=False):
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/adult.csv'
  drop_cols = []
  label_col = 'income'
  label_map = {'<=50K': -1, '>50K': 1}
+
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info)
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv)
 
  return df
 
 
  def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
- csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
- drop_cols = []
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/TNweather_1.8M.csv'
+ drop_cols = ['Unnamed: 0']
  label_col = 'rain_tomorrow'
  label_map = {0: -1, 1: 1}
 
@@ -180,194 +209,15 @@ def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info =
  'trans_type': 0
  }
 
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, time_info = time_info)
-
- return df
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
 
 
- # def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
- # csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
- # label_col = 'rain_tomorrow'
- # label_map = {0: -1, 1: 1}
-
- # if not os.path.exists(csv_path):
- # print('\n' + '*'*60)
- # print(f"Please download the data.")
- # print(csv_path)
- # _download_data(data_name)
- # # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
- # # Step 0: Load the dataset
- # df = pd.read_csv(csv_path)
-
- # df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
- # df = df.dropna(subset=['timestamp'])
-
- # df['time'] = pd.to_datetime(df['time'])
- # df['year'] = df['time'].dt.year
- # df['month'] = df['time'].dt.month
- # df['day'] = df['time'].dt.day
- # df['hour'] = df['time'].dt.hour
-
- # user_one_hot_cols = ['year','month','day', 'hour']
- # drop_cols = ['Unnamed: 0', 'time']
-
- # # Save original size
- # m_original, n_original = df.shape
-
- # # Step 1: Drop non-informative columns
- # df = df.drop(columns=drop_cols)
-
- # # Step 2: Remove rows with missing values
- # df = df.dropna(axis=0, how="any")
- # m_encoded, n_encoded = df.shape
-
- # # Step 3: Map target label (to -1 and +1)
- # df[label_col] = df[label_col].map(label_map)
-
- # # Step 4: Encode categorical features (exclude label column)
- # text_feature_cols = df.select_dtypes(
- # include=["object", "string", "category"]
- # ).columns
- # text_feature_cols = [
- # col for col in text_feature_cols if col != label_col
- # ] # ✅ exclude label
-
- # df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
- # m_cleaned, n_cleaned = df.shape
-
- # num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
- # scaler = StandardScaler()
- # df[num_cols] = scaler.fit_transform(df[num_cols])
-
- # if export_csv:
- # _export_csv(df, data_name)
-
- # # print info
- # if print_info:
- # pos_count = (df[label_col] == 1).sum()
- # neg_count = (df[label_col] == -1).sum()
-
- # # Step 6: Print dataset information
- # print("\n" + "=" * 80)
- # print(f"{f'{data_name} - Info':^70}")
- # print("=" * 80)
- # print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
- # print(
- # f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
- # )
- # print(f"{'Positive samples (+1):':<40} {pos_count}")
- # print(f"{'Negative samples (-1):':<40} {neg_count}")
- # print(
- # f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
- # )
- # print("-" * 80)
- # print(f"Note:")
- # print(f"{'Label column:':<40} {label_col}")
- # print(f"{'label_map:':<40} {label_map}")
- # print(
- # f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
- # )
- # print(
- # f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
- # )
- # print("=" * 80 + "\n")
-
- # return df
-
-
- # def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
- # csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
- # drop_cols = ['user_id']
- # label_col = 'subscribed_after'
- # label_map = {0: -1, 1: 1}
-
- # if not os.path.exists(csv_path):
- # print('\n' + '*'*60)
- # print(f"Please download the data.")
- # print(csv_path)
- # _download_data(data_name)
- # # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
-
- # # Step 0: Load the dataset
- # df = pd.read_csv(csv_path)
-
- # df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
- # df = df.dropna(subset=['timestamp'])
-
- # df["hour"] = df['timestamp'].dt.hour
- # df["dayofweek"] = df['timestamp'].dt.dayofweek
- # df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
-
- # user_one_hot_cols = ['dayofweek','is_weekend','hour']
- # drop_cols = ['user_id', 'timestamp']
-
- # # Save original size
- # m_original, n_original = df.shape
-
- # # Step 1: Drop non-informative columns
- # df = df.drop(columns=drop_cols)
-
- # # Step 2: Remove rows with missing values
- # df = df.dropna(axis=0, how="any")
- # m_encoded, n_encoded = df.shape
-
- # # Step 3: Map target label (to -1 and +1)
- # df[label_col] = df[label_col].map(label_map)
-
- # # Step 4: Encode categorical features (exclude label column)
- # text_feature_cols = df.select_dtypes(
- # include=["object", "string", "category"]
- # ).columns
- # text_feature_cols = [
- # col for col in text_feature_cols if col != label_col
- # ] # ✅ exclude label
-
- # df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
- # m_cleaned, n_cleaned = df.shape
-
- # num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
- # scaler = StandardScaler()
- # df[num_cols] = scaler.fit_transform(df[num_cols])
-
- # if export_csv:
- # _export_csv(df, data_name)
-
- # # print info
- # if print_info:
- # pos_count = (df[label_col] == 1).sum()
- # neg_count = (df[label_col] == -1).sum()
-
- # # Step 6: Print dataset information
- # print("\n" + "=" * 80)
- # print(f"{f'{data_name} - Info':^70}")
- # print("=" * 80)
- # print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
- # print(
- # f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
- # )
- # print(f"{'Positive samples (+1):':<40} {pos_count}")
- # print(f"{'Negative samples (-1):':<40} {neg_count}")
- # print(
- # f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
- # )
- # print("-" * 80)
- # print(f"Note:")
- # print(f"{'Label column:':<40} {label_col}")
- # print(f"{'label_map:':<40} {label_map}")
- # print(
- # f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
- # )
- # print(
- # f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
- # )
- # print("=" * 80 + "\n")
-
- # return df
-
+ return df
 
  def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
- csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
+
+ data_type = "binary"
+ csv_path = f'./exp_data/{data_type}/{data_name}/youtube recommendation dataset.csv'
  drop_cols = ['user_id']
  label_col = 'subscribed_after'
  label_map = {0: -1, 1: 1}
@@ -379,7 +229,7 @@ def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = Fa
  'time_col_name': 'timestamp',
  'trans_type': 1
  }
-
- df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, time_info = time_info)
+
+ df = _run(csv_path, data_name, data_type, drop_cols, label_col, label_map, print_info, export_csv=export_csv, time_info=time_info)
 
  return df
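Taken together, every public loader in DataSets.py now accepts export_csv and forwards data_type to _run. A minimal usage sketch; the call itself is illustrative, while the argument names, defaults, and paths come from the diff above:

```python
import junshan_kit.DataSets as DataSets

# Downloads to ./exp_data/binary/YouTube Recommendation/ if the CSV is missing,
# expands the 'timestamp' column with trans_type = 1 ('hour', 'dayofweek', 'is_weekend'),
# and, because export_csv=True, writes the processed frame to
# ./exp_data/YouTube Recommendation/YouTube Recommendation_num.csv
df = DataSets.YouTube_Recommendation(print_info=True, export_csv=True)
print(df.shape)
```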