PyPI - junshan-kit - Versions diffs - 2.2.0__py2.py3-none-any.whl → 2.2.2__py2.py3-none-any.whl - Mend

junshan-kit 2.2.0__py2.py3-none-any.whl → 2.2.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

junshan_kit/DataProcessor.py +63 -2
junshan_kit/datahub.py +7 -0
junshan_kit/kit.py +35 -0
junshan_kit/meta.py +21 -17
junshan_kit/test.py +7 -2
{junshan_kit-2.2.0.dist-info → junshan_kit-2.2.2.dist-info}/METADATA +1 -1
junshan_kit-2.2.2.dist-info/RECORD +9 -0
junshan_kit-2.2.0.dist-info/RECORD +0 -8
{junshan_kit-2.2.0.dist-info → junshan_kit-2.2.2.dist-info}/WHEEL +0 -0

junshan_kit/DataProcessor.py CHANGED Viewed

@@ -1,10 +1,17 @@
+"""
+----------------------------------------------------------------------
+>>> Author       : Junshan Yin
+>>> Last Updated : 2025-10-12
+----------------------------------------------------------------------
+"""
 import pandas as pd
 import os
 from sklearn.preprocessing import StandardScaler
 import junshan_kit.datahub
+import kit
-class CSVToPandas:
+class CSVToPandasMeta:
     def __init__(self):
         self.data_downloader = junshan_kit.datahub.kaggle_data()
@@ -79,7 +86,7 @@ class CSVToPandas:
         scaler = StandardScaler()
         df[num_cols] = scaler.fit_transform(df[num_cols])
-        # 导出后的大小
+        # The size after export
         m_export, n_export = df.shape
         if show_info:
@@ -103,3 +110,57 @@ class CSVToPandas:
+class CSV_TO_Pandas:
+    def __init__(self):
+        pass
+    def clean_data(self, csv_path, drop_cols: list, label_col: str, label_map: dict,  print_info = False):
+        # Step 0: Load the dataset
+        df = pd.read_csv(csv_path)
+        # Save original size
+        m_original, n_original = df.shape
+        # Step 1: Drop non-informative columns
+        df = df.drop(columns=drop_cols)
+        # Step 2: Remove rows with missing values
+        df = df.dropna(axis=0, how='any')
+        m_encoded, n_encoded = df.shape
+        # Step 3: Map target label to -1 and +1
+        df[label_col] = df[label_col].map(label_map)
+        # Step 4: Encode categorical features (exclude label column)
+        text_feature_cols = df.select_dtypes(include=['object', 'string', 'category']).columns
+        text_feature_cols = [col for col in text_feature_cols if col != label_col]  # ✅ exclude label
+        df = pd.get_dummies(df, columns=text_feature_cols, dtype=int)
+        m_cleaned, n_cleaned = df.shape
+        # print info
+        if print_info:
+            pos_count = (df[label_col] == 1).sum()
+            neg_count = (df[label_col] == -1).sum()
+            # Step 6: Print dataset information
+            print('\n' + '='*80)
+            print(f"{'Dataset Info':^70}")
+            print('='*80)
+            print(f"{'Original size:':<40} {m_original} rows x {n_original} cols")
+            print(f"{'Size after dropping NaN & non-feature cols:':<40} {m_cleaned} rows x {n_cleaned} cols")
+            print(f"{'Positive samples (+1):':<40} {pos_count}")
+            print(f"{'Negative samples (-1):':<40} {neg_count}")
+            print(f"{'Size after one-hot encoding:':<40} {m_encoded} rows x {n_encoded} cols")
+            print('-'*80)
+            print(f"Note:")
+            print(f"{'Label column:':<40} {label_col}")
+            print(f"{'Dropped non-feature columns:':<40} {', '.join(drop_cols) if drop_cols else 'None'}")
+            print(f"{'text fetaure columns:':<40} {', '.join(list(text_feature_cols)) if list(text_feature_cols) else 'None'}")
+            print('='*80 + '\n')
+        return df

junshan_kit/datahub.py CHANGED Viewed

@@ -1,3 +1,10 @@
+"""
+----------------------------------------------------------------------
+>>> Author       : Junshan Yin
+>>> Last Updated : 2025-10-12
+----------------------------------------------------------------------
+"""
 import kagglehub
 import os, time
 import warnings

junshan_kit/kit.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""
+----------------------------------------------------------------------
+>>> Author       : Junshan Yin
+>>> Last Updated : 2025-10-13
+----------------------------------------------------------------------
+"""
+import zipfile
+import os
+def unzip_file(zip_path: str, unzip_folder: str):
+    """
+    Args:
+        zip_path (str): Path to the ZIP file to extract.
+        dest_folder (str, optional): Folder to extract files into.
+            If None, the function will create a folder with the same
+            name as the ZIP file (without extension).
+    Examples:
+        >>> zip_path = "./downloads/data.zip"
+        >>> unzip_folder = "./exp_data/data"
+        >>> unzip_file(zip_path, unzip_folder)
+    """
+    if unzip_folder is None:
+        unzip_folder = os.path.splitext(os.path.basename(zip_path))[0]
+    os.makedirs(unzip_folder, exist_ok=True)
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(unzip_folder)
+    print(f"✅ Extracted '{zip_path}' to '{os.path.abspath(unzip_folder)}'")

junshan_kit/meta.py CHANGED Viewed

@@ -1,3 +1,10 @@
+"""
+----------------------------------------------------------------------
+>>> Author       : Junshan Yin
+>>> Last Updated : 2025-10-12
+----------------------------------------------------------------------
+"""
 import os
 import time
 import shutil
@@ -10,13 +17,16 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 # =============================================================
 #                   KAGGLE DATA MANAGEMENT
 # =============================================================
 class KaggleData:
     def list_datasets(self):
-        """List available datasets from a specific user."""
+        """
+        List available datasets from a specific user.
+        """
         api = KaggleApi()
         api.authenticate()
         datasets = api.dataset_list(user='junshan888')
@@ -51,6 +61,11 @@ class KaggleData:
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 class JianguoyunDownloaderChrome:
+    """ Example:
+    >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
+    >>> downloader = JianguoyunDownloaderChrome(url)
+    >>> downloader.run()
+    """
     def __init__(self, url, download_path="./downloads"):
         self.url = url
         self.download_path = os.path.abspath(download_path)
@@ -148,6 +163,11 @@ from selenium.webdriver.firefox.options import Options as FirefoxOptions
 from selenium.webdriver.firefox.service import Service
 class JianguoyunDownloaderFirefox:
+    """ Example:
+    >>> url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
+    >>> downloader = JianguoyunDownloaderFirefox(url)
+    >>> downloader.run()
+    """
     def __init__(self, url, download_path="./downloads"):
         self.url = url
         self.download_path = os.path.abspath(download_path)
@@ -234,19 +254,3 @@ class JianguoyunDownloaderFirefox:
             self.close()
         print('*' * 60)
-# =============================================================
-#                           MAIN
-# =============================================================
-if __name__ == "__main__":
-    url = "https://www.jianguoyun.com/p/DdyHJxUQqdHDDRjvtI0GIAA"
-    use_chrome = True  # Set True to use Chrome, False for Firefox
-    if use_chrome:
-        downloader = JianguoyunDownloaderChrome(url)
-    else:
-        downloader = JianguoyunDownloaderFirefox(url)
-    downloader.run()

junshan_kit/test.py CHANGED Viewed

@@ -1,3 +1,8 @@
-from datahub import JianguoDownloader
+from DataProcessor import CSV_TO_Pandas
-data2 = JianguoDownloader('www.lka.com', './expspe')
+data_ = CSV_TO_Pandas()
+data_.clean_data('data_csv/Electric Vehicle Population Data/Electric_Vehicle_Population_Data.csv', [], [], {})

{junshan_kit-2.2.0.dist-info → junshan_kit-2.2.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: junshan_kit
-Version: 2.2.0
+Version: 2.2.2
 Summary: This is an optimization tool.
 Author-email: Junshan Yin <junshanyin@163.com>
 Requires-Dist: kaggle==1.7.4.5

junshan_kit-2.2.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+junshan_kit/DataProcessor.py,sha256=S-_QG2ZkHCGyhS8cxYEnO9z1vyKMrNHYd2j1DuAeNG0,6266
+junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+junshan_kit/datahub.py,sha256=_Q_3AlZ8vk1Ma6h9I44SxWBA8w9m1CQNvYztMcsxzUo,5377
+junshan_kit/kit.py,sha256=h4Q_87hEJbXH4A9ryaGMu_nle5RlM8OR_PaW_hWCVBY,1040
+junshan_kit/meta.py,sha256=SiY9P93aABrksNE6G3ft5gzcuP2cUgc4Vx6LH7ZFmzg,10113
+junshan_kit/test.py,sha256=FgzG4oG7kkq6rWasxdBSY1qx_B0navRI5Ei-wJ1Dvo0,180
+junshan_kit-2.2.2.dist-info/METADATA,sha256=Qe9kokd4FFGlKhg5NDaMhpQrhRSulPvCAr4wcp9rsEo,329
+junshan_kit-2.2.2.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
+junshan_kit-2.2.2.dist-info/RECORD,,

junshan_kit-2.2.0.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-junshan_kit/DataProcessor.py,sha256=AW_1jROexC3s41-RgzqzYVwPI0sOf3tzjiph4qa_Vcw,3882
-junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-junshan_kit/datahub.py,sha256=I34e26psFS8WK4X6SNucKPLtdBm0Ujzqa0VDIRACah4,5163
-junshan_kit/meta.py,sha256=5aHyUPVr3P3yoAdC4DzOZv4AtaO9iX8zGjluwpOly6Q,10017
-junshan_kit/test.py,sha256=uSckjcr_Wgj__YPTwD6x0GY8Hfn5GBEXIpRf9vIYBbU,91
-junshan_kit-2.2.0.dist-info/METADATA,sha256=aWDiR4w_Z7sVVrLcqjQNYgt3L-iFWSydzcoiUPqDsg8,329
-junshan_kit-2.2.0.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
-junshan_kit-2.2.0.dist-info/RECORD,,

{junshan_kit-2.2.0.dist-info → junshan_kit-2.2.2.dist-info}/WHEEL RENAMED Viewed

File without changes

junshan-kit 2.2.0__py2.py3-none-any.whl → 2.2.2__py2.py3-none-any.whl

junshan-kit 2.2.0__py2.py3-none-any.whl → 2.2.2__py2.py3-none-any.whl