junshan-kit 2.4.8__py2.py3-none-any.whl → 2.4.9__py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of junshan-kit has been flagged as potentially problematic.

junshan_kit/DataHub.py ADDED
@@ -0,0 +1,114 @@
+ import torchvision, torch
+ import torchvision.transforms as transforms
+ import pandas as pd
+
+ from junshan_kit import DataSets, DataProcessor
+
+
+
+
+
+
+
+
+
+
+
+
+ def Adult_Income_Prediction(Paras):
+
+     df = DataSets.adult_income_prediction()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col='income'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+ def Credit_Card_Fraud_Detection(Paras):
+     df = DataSets.credit_card_fraud_detection()
+     transform = {
+         "train_size": 0.7,
+         "normalization": True
+     }
+     label_col='Class'
+
+     train_dataset, test_dataset, transform = DataProcessor.Pandas_TO_Torch(df, label_col).to_torch(transform, Paras)
+
+     return train_dataset, test_dataset, transform
+
+
+
+ def MNIST(Paras, model_name):
+     """
+     Load the MNIST dataset and return both the training and test sets,
+     along with the transformation applied (ToTensor).
+     """
+     transform = torchvision.transforms.ToTensor()
+
+     train_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.MNIST(
+         root='./exp_data/MNIST',
+         train=False,
+         download=True,
+         transform=transform
+     )
+
+     if Paras["model_type"][model_name] == "binary":
+
+         train_mask = (train_dataset.targets == 0) | (train_dataset.targets == 1)
+         test_mask = (test_dataset.targets == 0) | (test_dataset.targets == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
+
+
+ def CIFAR100(Paras, model_name):
+     """
+     Load the CIFAR-100 dataset with standard normalization and return both
+     the training and test sets, along with the transformation applied.
+     """
+     transform = transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.5071, 0.4867, 0.4408],
+                              std=[0.2675, 0.2565, 0.2761])
+     ])
+
+     train_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=True,
+         download=True,
+         transform=transform
+     )
+
+     test_dataset = torchvision.datasets.CIFAR100(
+         root='./exp_data/CIFAR100',
+         train=False,
+         download=True,
+         transform=transform
+     )
+     if Paras["model_type"][model_name] == "binary":
+         train_mask = (torch.tensor(train_dataset.targets) == 0) | (torch.tensor(train_dataset.targets) == 1)
+         test_mask = (torch.tensor(test_dataset.targets) == 0) | (torch.tensor(test_dataset.targets) == 1)
+
+         train_indices = torch.nonzero(train_mask, as_tuple=True)[0]
+         test_indices = torch.nonzero(test_mask, as_tuple=True)[0]
+
+         train_dataset = torch.utils.data.Subset(train_dataset, train_indices.tolist())
+         test_dataset = torch.utils.data.Subset(test_dataset, test_indices.tolist())
+
+     return train_dataset, test_dataset, transform
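
Every loader in the new DataHub.py follows the same contract: it takes a `Paras` dict and returns `(train_dataset, test_dataset, transform)`. A minimal usage sketch, assuming a hypothetical `Paras` with the `seed` and `model_type` keys this code reads (the model name "logreg" is made up for the example):

    from junshan_kit import DataHub

    # Hypothetical Paras; "seed" is used for the train/test split and
    # "model_type" is looked up per model name in MNIST/CIFAR100.
    Paras = {"seed": 42, "model_type": {"logreg": "binary"}}

    # Tabular data: 70/30 stratified split with standardized features.
    train_ds, test_ds, transform = DataHub.Adult_Income_Prediction(Paras)

    # Images: keeps only digits 0 and 1 because the model type is "binary".
    train_mnist, test_mnist, _ = DataHub.MNIST(Paras, "logreg")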
@@ -6,6 +6,7 @@
  """

  import pandas as pd
+ import torch
  from sklearn.preprocessing import StandardScaler


@@ -13,7 +14,6 @@ class CSV_TO_Pandas:
      def __init__(self):
          pass

-
      def _trans_time_fea(self, df, time_info: dict):
          """
          Transform and extract time-based features from a specified datetime column.
@@ -82,6 +82,15 @@ class CSV_TO_Pandas:

              user_text_fea = ['hour','dayofweek','is_weekend']
              df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
+
+         elif trans_type == 2:
+             df.loc[:, "year"] = df[time_col_name].dt.year
+             df.loc[:, "month"] = df[time_col_name].dt.month
+             df.loc[:, "day"] = df[time_col_name].dt.day
+
+
+             user_text_fea = ['year','month','day']
+             df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
          else:
              print("error!")

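
The new `trans_type == 2` branch mirrors the existing ones: it derives calendar fields from the datetime column and one-hot encodes them. A standalone sketch of the same transformation on a toy frame (the column name `ts` is illustrative):

    import pandas as pd

    df = pd.DataFrame({"ts": pd.to_datetime(["2021-01-15", "2022-06-30"])})
    df["year"] = df["ts"].dt.year
    df["month"] = df["ts"].dt.month
    df["day"] = df["ts"].dt.day
    # One column per observed value: year_2021, year_2022, month_1, month_6, ...
    df = pd.get_dummies(df, columns=["year", "month", "day"], dtype=int)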
@@ -91,15 +100,15 @@ class CSV_TO_Pandas:

      def preprocess_dataset(
          self,
-         csv_path,
+         df,
          drop_cols: list,
          label_col: str,
          label_map: dict,
          title_name: str,
          user_one_hot_cols=[],
          print_info=False,
-         Standard=False,
-         time_info: dict | None = None
+         time_info: dict | None = None,
+         missing_strategy = 'drop', # [drop, mode]
      ):
          """
          Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
@@ -121,6 +130,9 @@ class CSV_TO_Pandas:
              print_info (bool, optional):
                  Whether to print preprocessing information and dataset statistics.
                  Defaults to False.
+             title_name (str):
+                 Title used for the summary table or report that documents
+                 the preprocessing steps and dataset statistics.

          Returns:
              pandas.DataFrame:
@@ -145,7 +157,7 @@ class CSV_TO_Pandas:
          ... )
          """
          # Step 0: Load the dataset
-         df = pd.read_csv(csv_path)
+         # df = pd.read_csv(csv_path)
          columns = df.columns

          # Save original size
@@ -155,7 +167,15 @@ class CSV_TO_Pandas:
          df = df.drop(columns=drop_cols)

          # Step 2: Remove rows with missing values
-         df = df.dropna(axis=0, how="any")
+         if missing_strategy == 'drop':
+             df = df.dropna(axis=0, how="any")
+
+         elif missing_strategy == 'mode':
+             for col in df.columns:
+                 if df[col].notna().any():
+                     mode_val = df[col].mode()[0]
+                     df[col] = df[col].fillna(mode_val)
+
          m_encoded, n_encoded = df.shape

          if time_info is not None:
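
The `'mode'` strategy imputes each column's missing values with its most frequent value rather than dropping rows; `mode()[0]` takes the first mode when several values tie, and the `notna().any()` guard skips columns that are entirely NaN. The same logic in isolation:

    import pandas as pd

    df = pd.DataFrame({"color": ["red", None, "red", "blue"],
                       "size": [1.0, 2.0, None, 2.0]})
    for col in df.columns:
        if df[col].notna().any():
            df[col] = df[col].fillna(df[col].mode()[0])
    # NaN in "color" becomes "red"; NaN in "size" becomes 2.0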
@@ -177,17 +197,6 @@ class CSV_TO_Pandas:
          )
          m_cleaned, n_cleaned = df.shape

-         if Standard:
-             # Identify numerical columns Standardize numerical columns
-             num_cols = [
-                 col
-                 for col in df.columns
-                 if col
-                 not in list(text_feature_cols) + [label_col] + [user_one_hot_cols]
-             ]
-             scaler = StandardScaler()
-             df[num_cols] = scaler.fit_transform(df[num_cols])
-
          # print info
          if print_info:
              pos_count = (df[label_col] == 1).sum()
@@ -201,6 +210,7 @@ class CSV_TO_Pandas:
              print(
                  f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
              )
+             print(f"{'missing_strategy:':<40} {missing_strategy}")
              print(
                  f"{'Dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
              )
@@ -216,19 +226,97 @@ class CSV_TO_Pandas:
              print(f"{'label_map:':<40} {label_map}")
              print(f"{'time column:':<40} {time_info}")
              if time_info is not None:
-                 print(f"{'trans_type : int, optional, default=1'}")
-                 print("- 0 : Extract ['year', 'month', 'day', 'hour']")
-                 print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+                 if time_info["trans_type"] == 0:
+                     print("- 0 : Extract ['year', 'month', 'day', 'hour']")
+                 elif time_info["trans_type"] == 1:
+                     print("- 1 : Extract ['hour', 'dayofweek', 'is_weekend']")
+                 elif time_info["trans_type"] == 2:
+                     print("- 2 : Extract ['year', 'month', 'day']")
+                 else:
+                     assert False
              print(
                  f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
              )
-             print("-" * 80)
-             print("all columns:")
-             print(list(columns))
-             print("=" * 80 + "\n")
+             # print("-" * 80)
+             # print("all columns:")
+             # print(list(columns))
+             # print("=" * 80 + "\n")

          return df
+
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler
+ from torch.utils.data import Dataset
+
+ class Pandas_TO_Torch(Dataset):
+
+     def __init__(self, df: pd.DataFrame,
+                  label_col: str,
+                  ):
+         self.df = df
+         self.label_col = label_col

+         # Identify feature columns automatically (all except the label)
+         self.label_col = label_col
+         self.feature_cols = [col for col in self.df.columns if col != label_col]
+
+         # Extract features and labels
+         self.features = self.df[self.feature_cols].values.astype("float32")
+         self.labels = self.df[self.label_col].values.astype("int64")
+
+
+     def __len__(self):
+         """Return the total number of samples."""
+         return len(self.features)
+
+     def __getitem__(self, idx):
+         x = torch.tensor(self.features[idx], dtype=torch.float32)
+         y = torch.tensor(self.labels[idx], dtype=torch.long)
+
+         return x, y
+
+     def __repr__(self):
+         info = (
+             f"Dataset CustomNumericDataset\n"
+             f"    Number of datapoints: {len(self)}\n"
+             f"    Features: {self.features.shape[1]}\n"
+         )
+         return info
+
+     def to_torch(self, transform, Paras):
+         fea_cols = [col for col in self.df.columns if col != self.label_col]
+
+         if transform["normalization"]:
+             scaler = StandardScaler()
+             self.df[fea_cols] = scaler.fit_transform(self.df[fea_cols])
+
+         # Train/test split
+
+         train_df, test_df = train_test_split(self.df, train_size=transform["train_size"], random_state=Paras["seed"], stratify=self.df[self.label_col])
+
+         # Create datasets
+         train_dataset = Pandas_TO_Torch(train_df, self.label_col)
+         test_dataset = Pandas_TO_Torch(test_df, self.label_col)
+
+         return train_dataset, test_dataset, transform
+
+
+ class TXT_TO_Numpy:
+     def __init__(self):
+         pass
+
+
+ class bz2_To_Numpy:
+     def __init__(self):
+         pass
+
+
+
+
+
+
+

  class StepByStep:
      def __init__(self):
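
Since `Pandas_TO_Torch` subclasses `torch.utils.data.Dataset`, the datasets returned by `to_torch` plug straight into a `DataLoader`. A minimal sketch, assuming a numeric frame with a 0/1 label column and accessing the class through `DataProcessor`, as DataHub.py does (the data and column names are made up):

    import pandas as pd
    from torch.utils.data import DataLoader
    from junshan_kit import DataProcessor

    df = pd.DataFrame({"f1": [0.1, 0.5, 0.9, 0.3],
                       "f2": [1.0, 0.0, 1.0, 0.0],
                       "label": [0, 1, 1, 0]})
    transform = {"train_size": 0.5, "normalization": True}
    train_ds, test_ds, _ = DataProcessor.Pandas_TO_Torch(df, "label").to_torch(
        transform, {"seed": 0})

    for x, y in DataLoader(train_ds, batch_size=2):
        print(x.shape, y)  # float32 features, int64 labels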
@@ -242,3 +330,5 @@ class StepByStep:
              print(
                  f"Unique values ({len(df[col].unique())}): {df[col].unique().tolist()}"
              )
+
+