junshan_kit-2.3.5-py2.py3-none-any.whl → junshan_kit-2.3.7-py2.py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two published versions.
junshan_kit/DataProcessor.py
CHANGED
@@ -8,9 +8,87 @@
 import pandas as pd
 from sklearn.preprocessing import StandardScaler

+
 class CSV_TO_Pandas:
     def __init__(self):
         pass
+
+
+    def _trans_time_fea(self, df, time_info: dict):
+        """
+        Transform and extract time-based features from a specified datetime column.
+
+        This function converts a given column to pandas datetime format and
+        extracts different time-related features based on the specified mode.
+        It supports two extraction modes:
+        - type = 0: Extracts basic components (year, month, day, hour)
+        - type = 1: Extracts hour, day of week, and weekend indicator
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            Input DataFrame containing the datetime column.
+        time_info:
+            - time_col_name : str
+                Name of the column containing time or datetime values.
+            - trans_type : int, optional, default=1
+                Extraction mode.
+                - 0 : Extract ['year', 'month', 'day', 'hour']
+                - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
+
+        Returns
+        -------
+        pandas.DataFrame
+            The DataFrame with newly added time-based feature columns.
+
+        Notes
+        -----
+        - Rows that cannot be parsed as valid datetime will be dropped automatically.
+        - 'dayofweek' ranges from 0 (Monday) to 6 (Sunday).
+        - 'is_weekend' equals 1 if the day is Saturday or Sunday, otherwise 0.
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> data = pd.DataFrame({
+        ...     'timestamp': ['2023-08-01 12:30:00', '2023-08-05 08:15:00', 'invalid_time']
+        ... })
+        >>> df = handler._trans_time_fea(data, {"time_col_name": "timestamp", "trans_type": 1})
+        >>> print(df)
+                    timestamp  hour  dayofweek  is_weekend
+        0 2023-08-01 12:30:00    12          1           0
+        1 2023-08-05 08:15:00     8          5           1
+        """
+
+        time_col_name, trans_type = time_info['time_col_name'], time_info['trans_type']
+
+        df[time_col_name] = pd.to_datetime(df[time_col_name], errors="coerce")
+
+        # Drop rows where the datetime conversion failed, and make an explicit copy
+        df = df.dropna(subset=[time_col_name]).copy()
+
+        if trans_type == 0:
+            df.loc[:, "year"] = df[time_col_name].dt.year
+            df.loc[:, "month"] = df[time_col_name].dt.month
+            df.loc[:, "day"] = df[time_col_name].dt.day
+            df.loc[:, "hour"] = df[time_col_name].dt.hour
+
+            user_text_fea = ['year','month','day', 'hour']
+            df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
+
+        elif trans_type == 1:
+            df.loc[:, "hour"] = df[time_col_name].dt.hour
+            df.loc[:, "dayofweek"] = df[time_col_name].dt.dayofweek
+            df.loc[:, "is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
+
+            user_text_fea = ['hour','dayofweek','is_weekend']
+            df = pd.get_dummies(df, columns=user_text_fea, dtype=int)
+        else:
+            print("error!")
+
+        df = df.drop(columns=[time_col_name])
+
+        return df

     def preprocess_dataset(
         self,
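The new helper can be exercised directly. A minimal sketch (the `handler` object and sample frame are hypothetical; note that, as implemented, the method also one-hot encodes the extracted fields and drops the source column, so the real output is wider than the docstring example suggests):

    import pandas as pd
    import junshan_kit.DataProcessor

    handler = junshan_kit.DataProcessor.CSV_TO_Pandas()
    data = pd.DataFrame({
        'timestamp': ['2023-08-01 12:30:00', '2023-08-05 08:15:00', 'invalid_time'],
        'x': [1.0, 2.0, 3.0],
    })

    # trans_type=1 extracts hour/dayofweek/is_weekend, one-hot encodes them,
    # coerces 'invalid_time' to NaT (row dropped), and removes 'timestamp'.
    out = handler._trans_time_fea(data, {'time_col_name': 'timestamp', 'trans_type': 1})
    print(out.columns.tolist())
    # e.g. ['x', 'hour_8', 'hour_12', 'dayofweek_1', 'dayofweek_5', 'is_weekend_0', 'is_weekend_1']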
@@ -19,9 +97,10 @@ class CSV_TO_Pandas:
         label_col: str,
         label_map: dict,
         title_name: str,
-        user_one_hot_cols
+        user_one_hot_cols=[],
         print_info=False,
-        Standard
+        Standard=False,
+        time_info: dict | None = None
     ):
         """
         Preprocess a CSV dataset by performing data cleaning, label mapping, and feature encoding.
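With the widened signature, a time-aware call looks roughly like this (a sketch: the leading csv_path/drop_cols positionals are inferred from the `_run` call in DataSets.py below, and the file and column names are hypothetical):

    import junshan_kit.DataProcessor

    cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
    df = cleaner.preprocess_dataset(
        'data.csv',          # csv_path (hypothetical file)
        ['id'],              # drop_cols
        'target',            # label_col
        {0: -1, 1: 1},       # label_map
        'Demo',              # title_name
        print_info=True,
        time_info={'time_col_name': 'created_at', 'trans_type': 1},
    )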
@@ -72,6 +151,9 @@
         # Save original size
         m_original, n_original = df.shape

+        if time_info is not None:
+            df = self._trans_time_fea(df, time_info)
+
         # Step 1: Drop non-informative columns
         df = df.drop(columns=drop_cols)

@@ -90,12 +172,19 @@
             col for col in text_feature_cols if col != label_col
         ] # ✅ exclude label

-        df = pd.get_dummies(
+        df = pd.get_dummies(
+            df, columns=text_feature_cols + user_one_hot_cols, dtype=int
+        )
         m_cleaned, n_cleaned = df.shape

         if Standard:
-
-            num_cols = [
+            # Identify numerical columns Standardize numerical columns
+            num_cols = [
+                col
+                for col in df.columns
+                if col
+                not in list(text_feature_cols) + [label_col] + [user_one_hot_cols]
+            ]
             scaler = StandardScaler()
             df[num_cols] = scaler.fit_transform(df[num_cols])

@@ -119,8 +208,9 @@
             )
             print("-" * 80)
            print(f"Note:")
-            print(f"{'Label column:':<40} {label_col}")
+            print(f"{'Label column:':<40} {label_col}")
             print(f"{'label_map:':<40} {label_map}")
+            print(f"{'time column:':<40} {time_info}")
             print(
                 f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
             )
@@ -130,7 +220,7 @@
             print("=" * 80 + "\n")

         return df
-
+
 
 class StepByStep:
     def __init__(self):
@@ -139,6 +229,8 @@ class StepByStep:
     def print_text_fea(self, df, text_feature_cols):
         for col in text_feature_cols:
             print(f"\n{'-'*80}")
-            print(f
+            print(f'Feature: "{col}"')
             print(f"{'-'*80}")
-            print(
+            print(
+                f"Unique values ({len(df[col].unique())}): {df[col].unique().tolist()}"
+            )
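For reference, the repaired helper now prints each categorical feature with its unique values; a quick sketch (sample frame hypothetical):

    import pandas as pd
    import junshan_kit.DataProcessor

    sbs = junshan_kit.DataProcessor.StepByStep()
    df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'n': [1, 2, 3]})
    sbs.print_text_fea(df, ['color'])
    # Feature: "color"
    # Unique values (2): ['red', 'blue']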
junshan_kit/DataSets.py
CHANGED
@@ -1,7 +1,7 @@
 """
 ----------------------------------------------------------------------
 >>> Author : Junshan Yin
->>> Last Updated : 2025-
+>>> Last Updated : 2025-10-16
 ----------------------------------------------------------------------
 """

@@ -46,7 +46,7 @@ def _export_csv(df, data_name):
     print(path + f'{data_name}.csv')


-def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, ):
+def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_one_hot_cols = [], export_csv = False, time_info = None):
    if not os.path.exists(csv_path):
        print('\n' + '*'*60)
        print(f"Please download the data.")
@@ -55,7 +55,7 @@ def _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, user_
        # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')

    cleaner = junshan_kit.DataProcessor.CSV_TO_Pandas()
-   df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, user_one_hot_cols, print_info=print_info)
+   df = cleaner.preprocess_dataset(csv_path, drop_cols, label_col, label_map, data_name, user_one_hot_cols, print_info=print_info, time_info = time_info)

    if export_csv:
        _export_csv(df, data_name)
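Because time_info defaults to None both here and in preprocess_dataset, which applies _trans_time_fea only when time_info is not None, existing loaders that never pass it keep their 2.3.5 behavior. A sketch of the two call shapes (dataset name and paths hypothetical):

    # legacy call: no time features, unchanged behavior
    df = _run('./exp_data/Demo/demo.csv', 'Demo', [], 'label', {0: -1, 1: 1}, False)

    # time-aware call: the 'time' column is expanded before the usual cleaning steps
    df = _run('./exp_data/Demo/demo.csv', 'Demo', [], 'label', {0: -1, 1: 1}, False,
              time_info={'time_col_name': 'time', 'trans_type': 0})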
@@ -166,91 +166,220 @@ def adult_income_prediction(data_name = "Adult Income Prediction", print_info =
     return df


-
-
 def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
     csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
+    drop_cols = []
     label_col = 'rain_tomorrow'
     label_map = {0: -1, 1: 1}

-
-
-
-
-
-
-
-    # Step 0: Load the dataset
-    df = pd.read_csv(csv_path)
-
-    df['time'] = pd.to_datetime(df['time'])
-    df['year'] = df['time'].dt.year
-    df['month'] = df['time'].dt.month
-    df['day'] = df['time'].dt.day
-    df['hour'] = df['time'].dt.hour
-
-    user_one_hot_cols = ['year','month','day', 'hour']
-    drop_cols = ['Unnamed: 0', 'time']
+    # Extraction mode.
+    # - 0 : Extract ['year', 'month', 'day', 'hour']
+    # - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
+    time_info = {
+        'time_col_name': 'time',
+        'trans_type': 0
+    }

-
-    m_original, n_original = df.shape
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, time_info = time_info)

-
-    df = df.drop(columns=drop_cols)
+    return df

-    # Step 2: Remove rows with missing values
-    df = df.dropna(axis=0, how="any")
-    m_encoded, n_encoded = df.shape

-
-
+# def TamilNadu_weather_2020_2025(data_name = "TN Weather 2020-2025", print_info = False, export_csv = False):
+#     csv_path = f'./exp_data/{data_name}/TNweather_1.8M.csv'
+#     label_col = 'rain_tomorrow'
+#     label_map = {0: -1, 1: 1}

-
-
-
-
-
-
-
+#     if not os.path.exists(csv_path):
+#         print('\n' + '*'*60)
+#         print(f"Please download the data.")
+#         print(csv_path)
+#         _download_data(data_name)
+#         # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+
+#     # Step 0: Load the dataset
+#     df = pd.read_csv(csv_path)

-
-
+#     df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
+#     df = df.dropna(subset=['timestamp'])

-
-
-
+#     df['time'] = pd.to_datetime(df['time'])
+#     df['year'] = df['time'].dt.year
+#     df['month'] = df['time'].dt.month
+#     df['day'] = df['time'].dt.day
+#     df['hour'] = df['time'].dt.hour
+
+#     user_one_hot_cols = ['year','month','day', 'hour']
+#     drop_cols = ['Unnamed: 0', 'time']
+
+#     # Save original size
+#     m_original, n_original = df.shape
+
+#     # Step 1: Drop non-informative columns
+#     df = df.drop(columns=drop_cols)
+
+#     # Step 2: Remove rows with missing values
+#     df = df.dropna(axis=0, how="any")
+#     m_encoded, n_encoded = df.shape
+
+#     # Step 3: Map target label (to -1 and +1)
+#     df[label_col] = df[label_col].map(label_map)
+
+#     # Step 4: Encode categorical features (exclude label column)
+#     text_feature_cols = df.select_dtypes(
+#         include=["object", "string", "category"]
+#     ).columns
+#     text_feature_cols = [
+#         col for col in text_feature_cols if col != label_col
+#     ] # ✅ exclude label
+
+#     df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
+#     m_cleaned, n_cleaned = df.shape
+
+#     num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
+#     scaler = StandardScaler()
+#     df[num_cols] = scaler.fit_transform(df[num_cols])
+
+#     if export_csv:
+#         _export_csv(df, data_name)
+
+#     # print info
+#     if print_info:
+#         pos_count = (df[label_col] == 1).sum()
+#         neg_count = (df[label_col] == -1).sum()
+
+#         # Step 6: Print dataset information
+#         print("\n" + "=" * 80)
+#         print(f"{f'{data_name} - Info':^70}")
+#         print("=" * 80)
+#         print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
+#         print(
+#             f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+#         )
+#         print(f"{'Positive samples (+1):':<40} {pos_count}")
+#         print(f"{'Negative samples (-1):':<40} {neg_count}")
+#         print(
+#             f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
+#         )
+#         print("-" * 80)
+#         print(f"Note:")
+#         print(f"{'Label column:':<40} {label_col}")
+#         print(f"{'label_map:':<40} {label_map}")
+#         print(
+#             f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+#         )
+#         print(
+#             f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
+#         )
+#         print("=" * 80 + "\n")
+
+#     return df
+
+
+# def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
+#     csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
+#     drop_cols = ['user_id']
+#     label_col = 'subscribed_after'
+#     label_map = {0: -1, 1: 1}
+
+#     if not os.path.exists(csv_path):
+#         print('\n' + '*'*60)
+#         print(f"Please download the data.")
+#         print(csv_path)
+#         _download_data(data_name)
+#         # junshan_kit.kit.unzip_file(f'./exp_data/{data_name}/{data_name}.zip', f'./exp_data/{data_name}')
+
+#     # Step 0: Load the dataset
+#     df = pd.read_csv(csv_path)

-
-
+#     df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
+#     df = df.dropna(subset=['timestamp'])

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+#     df["hour"] = df['timestamp'].dt.hour
+#     df["dayofweek"] = df['timestamp'].dt.dayofweek
+#     df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
+
+#     user_one_hot_cols = ['dayofweek','is_weekend','hour']
+#     drop_cols = ['user_id', 'timestamp']
+
+#     # Save original size
+#     m_original, n_original = df.shape
+
+#     # Step 1: Drop non-informative columns
+#     df = df.drop(columns=drop_cols)
+
+#     # Step 2: Remove rows with missing values
+#     df = df.dropna(axis=0, how="any")
+#     m_encoded, n_encoded = df.shape
+
+#     # Step 3: Map target label (to -1 and +1)
+#     df[label_col] = df[label_col].map(label_map)
+
+#     # Step 4: Encode categorical features (exclude label column)
+#     text_feature_cols = df.select_dtypes(
+#         include=["object", "string", "category"]
+#     ).columns
+#     text_feature_cols = [
+#         col for col in text_feature_cols if col != label_col
+#     ] # ✅ exclude label
+
+#     df = pd.get_dummies(df, columns=text_feature_cols + user_one_hot_cols, dtype=int)
+#     m_cleaned, n_cleaned = df.shape
+
+#     num_cols = [col for col in df.columns if col not in list(text_feature_cols) + [label_col] + user_one_hot_cols]
+#     scaler = StandardScaler()
+#     df[num_cols] = scaler.fit_transform(df[num_cols])
+
+#     if export_csv:
+#         _export_csv(df, data_name)
+
+#     # print info
+#     if print_info:
+#         pos_count = (df[label_col] == 1).sum()
+#         neg_count = (df[label_col] == -1).sum()
+
+#         # Step 6: Print dataset information
+#         print("\n" + "=" * 80)
+#         print(f"{f'{data_name} - Info':^70}")
+#         print("=" * 80)
+#         print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
+#         print(
+#             f"{'Size after dropping NaN & non-feature cols:':<40} {m_encoded} rows x {n_encoded} cols"
+#         )
+#         print(f"{'Positive samples (+1):':<40} {pos_count}")
+#         print(f"{'Negative samples (-1):':<40} {neg_count}")
+#         print(
+#             f"{'Size after one-hot encoding:':<40} {m_cleaned} rows x {n_cleaned} cols"
+#         )
+#         print("-" * 80)
+#         print(f"Note:")
+#         print(f"{'Label column:':<40} {label_col}")
+#         print(f"{'label_map:':<40} {label_map}")
+#         print(
+#             f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}"
+#         )
+#         print(
+#             f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}"
+#         )
+#         print("=" * 80 + "\n")
+
+#     return df
+
+
+def YouTube_Recommendation(data_name = "YouTube Recommendation", print_info = False, export_csv = False):
+    csv_path = f'./exp_data/{data_name}/youtube recommendation dataset.csv'
+    drop_cols = ['user_id']
+    label_col = 'subscribed_after'
+    label_map = {0: -1, 1: 1}

-
+    # Extraction mode.
+    # - 0 : Extract ['year', 'month', 'day', 'hour']
+    # - 1 : Extract ['hour', 'dayofweek', 'is_weekend']
+    time_info = {
+        'time_col_name': 'timestamp',
+        'trans_type': 1
+    }
+
+    df = _run(csv_path, data_name, drop_cols, label_col, label_map, print_info, time_info = time_info)
+
+    return df
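Both refactored loaders now delegate the whole pipeline to _run, so typical usage reduces to the following sketch (it assumes the CSVs are already under ./exp_data/ or will be fetched by the package's download helper):

    from junshan_kit import DataSets

    # 'time' column one-hot expanded as year/month/day/hour (trans_type=0)
    df_weather = DataSets.TamilNadu_weather_2020_2025(print_info=True)

    # 'timestamp' one-hot expanded as hour/dayofweek/is_weekend (trans_type=1)
    df_youtube = DataSets.YouTube_Recommendation(print_info=True)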
junshan_kit-2.3.5.dist-info/METADATA → junshan_kit-2.3.7.dist-info/METADATA
CHANGED
@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: junshan_kit
-Version: 2.3.5
+Version: 2.3.7
 Summary: This is an optimization tool.
 Author-email: Junshan Yin <junshanyin@163.com>
 Requires-Dist: kaggle==1.7.4.5
 Requires-Dist: kagglehub==0.3.13
-Requires-Dist: numpy==2.2.
+Requires-Dist: numpy==2.2.7
 Requires-Dist: pandas==2.3.3
 Requires-Dist: scikit-learn==1.7.1
 Requires-Dist: selenium==4.36.0
junshan_kit-2.3.7.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+junshan_kit/DataProcessor.py,sha256=MOKMkq4OE32VyLkgUD-D2J5dORmUDLfylAir0UiI04E,8665
+junshan_kit/DataSets.py,sha256=EgDPN7Sm6MLSwxBpJE_A5TN-6eVsjGLjFoZdgg-BnZ8,13819
+junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
+junshan_kit-2.3.7.dist-info/METADATA,sha256=kYYOgCdx-lIUDOnK2nfxReqCommjtEW-25MxUDOpS6w,329
+junshan_kit-2.3.7.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+junshan_kit-2.3.7.dist-info/RECORD,,
junshan_kit-2.3.5.dist-info/RECORD
REMOVED
@@ -1,7 +0,0 @@
-junshan_kit/DataProcessor.py,sha256=niI7kun5lcBpTJaHzATE5vqnD_9GTyTID9fcKeYHxZ0,5316
-junshan_kit/DataSets.py,sha256=L3D0eBCKHWqpy3qXZvWQP_yKaNzWyj5W1_OLS736xjg,8972
-junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-junshan_kit/kit.py,sha256=tB1TpW9hW1EweK1RQwHOdUo7uG1QU4vSeyR0fdaSydo,9569
-junshan_kit-2.3.5.dist-info/METADATA,sha256=mZnRM7gqHpgFRZPQ1caQHNeUm7bpTW-XsM0vf733xDE,329
-junshan_kit-2.3.5.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
-junshan_kit-2.3.5.dist-info/RECORD,,
File without changes